From 003024776adbdee6e1473ee6a88b81aea6f8f4f3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 16 Feb 2020 12:00:48 -0800 Subject: [PATCH 001/134] ACPI: NUMA: Up-level "map to online node" functionality commit b2ca916ce392a9d4cea3489a3efb2b627b839eaf upstream. The acpi_map_pxm_to_online_node() helper is used to find the closest online node to a given proximity domain. This is used to map devices in a proximity domain with no online memory or cpus to the closest online node and populate a device's 'numa_node' property. The numa_node property allows applications to be migrated "close" to a resource. In preparation for providing a generic facility to optionally map an address range to its closest online node, or the node the range would represent were it to be onlined (target_node), up-level the core of acpi_map_pxm_to_online_node() to a generic mm/numa helper. Intel-SIG: commit b2ca916ce392 ACPI: NUMA: Up-level "map to online node" functionality Backport for SGX virtualization support on Intel Xeon platform. Cc: Michal Hocko Acked-by: Rafael J. Wysocki Reviewed-by: Ingo Molnar Signed-off-by: Dan Williams Link: https://lore.kernel.org/r/158188324802.894464.13128795207831894206.stgit@dwillia2-desk3.amr.corp.intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- drivers/acpi/numa/srat.c | 41 ---------------------------------------- include/linux/acpi.h | 23 +++++++++++++++++++++- include/linux/numa.h | 9 +++++++++ mm/mempolicy.c | 30 +++++++++++++++++++++++++++++ 4 files changed, 61 insertions(+), 42 deletions(-) diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 85e01752fbe4..b6b54d1d2539 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -72,47 +72,6 @@ int acpi_map_pxm_to_node(int pxm) } EXPORT_SYMBOL(acpi_map_pxm_to_node); -/** - * acpi_map_pxm_to_online_node - Map proximity ID to online node - * @pxm: ACPI proximity ID - * - * This is similar to acpi_map_pxm_to_node(), but always returns an online - * node. When the mapped node from a given proximity ID is offline, it - * looks up the node distance table and returns the nearest online node. - * - * ACPI device drivers, which are called after the NUMA initialization has - * completed in the kernel, can call this interface to obtain their device - * NUMA topology from ACPI tables. Such drivers do not have to deal with - * offline nodes. A node may be offline when a device proximity ID is - * unique, SRAT memory entry does not exist, or NUMA is disabled, ex. - * "numa=off" on x86. 
- */ -int acpi_map_pxm_to_online_node(int pxm) -{ - int node, min_node; - - node = acpi_map_pxm_to_node(pxm); - - if (node == NUMA_NO_NODE) - node = 0; - - min_node = node; - if (!node_online(node)) { - int min_dist = INT_MAX, dist, n; - - for_each_online_node(n) { - dist = node_distance(node, n); - if (dist < min_dist) { - min_dist = dist; - min_node = n; - } - } - } - - return min_node; -} -EXPORT_SYMBOL(acpi_map_pxm_to_online_node); - static void __init acpi_table_print_srat_entry(struct acpi_subtable_header *header) { diff --git a/include/linux/acpi.h b/include/linux/acpi.h index dd0f2e4b79a7..6055725185e7 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -421,9 +421,30 @@ extern void acpi_osi_setup(char *str); extern bool acpi_osi_is_win8(void); #ifdef CONFIG_ACPI_NUMA -int acpi_map_pxm_to_online_node(int pxm); int acpi_map_pxm_to_node(int pxm); int acpi_get_node(acpi_handle handle); + +/** + * acpi_map_pxm_to_online_node - Map proximity ID to online node + * @pxm: ACPI proximity ID + * + * This is similar to acpi_map_pxm_to_node(), but always returns an online + * node. When the mapped node from a given proximity ID is offline, it + * looks up the node distance table and returns the nearest online node. + * + * ACPI device drivers, which are called after the NUMA initialization has + * completed in the kernel, can call this interface to obtain their device + * NUMA topology from ACPI tables. Such drivers do not have to deal with + * offline nodes. A node may be offline when a device proximity ID is + * unique, SRAT memory entry does not exist, or NUMA is disabled, ex. + * "numa=off" on x86. + */ +static inline int acpi_map_pxm_to_online_node(int pxm) +{ + int node = acpi_map_pxm_to_node(pxm); + + return numa_map_to_online_node(node); +} #else static inline int acpi_map_pxm_to_online_node(int pxm) { diff --git a/include/linux/numa.h b/include/linux/numa.h index 110b0e5d0fb0..20f4e44b186c 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -13,4 +13,13 @@ #define NUMA_NO_NODE (-1) +#ifdef CONFIG_NUMA +int numa_map_to_online_node(int node); +#else +static inline int numa_map_to_online_node(int node) +{ + return NUMA_NO_NODE; +} +#endif + #endif /* _LINUX_NUMA_H */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 87d165923fee..ea3962fe74bd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -127,6 +127,36 @@ static struct mempolicy default_policy = { static struct mempolicy preferred_node_policy[MAX_NUMNODES]; +/** + * numa_map_to_online_node - Find closest online node + * @nid: Node id to start the search + * + * Lookup the next closest node by distance if @nid is not online. + */ +int numa_map_to_online_node(int node) +{ + int min_node; + + if (node == NUMA_NO_NODE) + node = 0; + + min_node = node; + if (!node_online(node)) { + int min_dist = INT_MAX, dist, n; + + for_each_online_node(n) { + dist = node_distance(node, n); + if (dist < min_dist) { + min_dist = dist; + min_node = n; + } + } + } + + return min_node; +} +EXPORT_SYMBOL_GPL(numa_map_to_online_node); + struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; -- Gitee From ba251ddeca6b28fd581ec99ab15145f0d5d03016 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 16 Feb 2020 12:00:53 -0800 Subject: [PATCH 002/134] mm/numa: Skip NUMA_NO_NODE and online nodes in numa_map_to_online_node() commit 4fcbe96e4d0bc4abe22ee10573ff663b9ebbcf17 upstream. Update numa_map_to_online_node() to stop falling back to numa node 0 when the input is NUMA_NO_NODE. 
Also, skip the lookup if @node is online. This makes the routine compatible with other arch node mapping routines. Intel-SIG: commit 4fcbe96e4d0b mm/numa: Skip NUMA_NO_NODE and online nodes in numa_map_to_online_node() Backport for SGX virtualization support on Intel Xeon platform. Reported-by: Aneesh Kumar K.V Reviewed-by: Aneesh Kumar K.V Link: https://lore.kernel.org/r/157401275716.43284.13185549705765009174.stgit@dwillia2-desk3.amr.corp.intel.com Reviewed-by: Ingo Molnar Signed-off-by: Dan Williams Link: https://lore.kernel.org/r/158188325316.894464.15650888748083329531.stgit@dwillia2-desk3.amr.corp.intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- mm/mempolicy.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ea3962fe74bd..df50d57e41a2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -135,21 +135,17 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES]; */ int numa_map_to_online_node(int node) { - int min_node; + int min_dist = INT_MAX, dist, n, min_node; - if (node == NUMA_NO_NODE) - node = 0; + if (node == NUMA_NO_NODE || node_online(node)) + return node; min_node = node; - if (!node_online(node)) { - int min_dist = INT_MAX, dist, n; - - for_each_online_node(n) { - dist = node_distance(node, n); - if (dist < min_dist) { - min_dist = dist; - min_node = n; - } + for_each_online_node(n) { + dist = node_distance(node, n); + if (dist < min_dist) { + min_dist = dist; + min_node = n; } } -- Gitee From 7f9c447a50bc2e9ce8fc37f0306ce77a3be993fe Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 16 Feb 2020 12:01:04 -0800 Subject: [PATCH 003/134] x86/mm: Introduce CONFIG_NUMA_KEEP_MEMINFO commit 1e5d8e1e47afde23e3249aed25d7d124feff5c1c upstream. Currently x86 numa_meminfo is marked __initdata in the CONFIG_MEMORY_HOTPLUG=n case. In support of a new facility to allow drivers to map reserved memory to a 'target_node' (phys_to_target_node()), add support for removing the __initdata designation for those users. Both memory hotplug and phys_to_target_node() users select CONFIG_NUMA_KEEP_MEMINFO to tell the arch to maintain its physical address to NUMA mapping infrastructure post init. Intel-SIG: commit 1e5d8e1e47af x86/mm: Introduce CONFIG_NUMA_KEEP_MEMINFO Backport for SGX virtualization support on Intel Xeon platform. Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Cc: Andrew Morton Cc: David Hildenbrand Cc: Michal Hocko Reviewed-by: Ingo Molnar Signed-off-by: Dan Williams Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/158188326422.894464.15742054998046628934.stgit@dwillia2-desk3.amr.corp.intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/mm/numa.c | 6 +----- include/linux/numa.h | 7 +++++++ mm/Kconfig | 5 +++++ 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 4123100e0eaf..68585f42e76f 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -25,11 +25,7 @@ nodemask_t numa_nodes_parsed __initdata; struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); -static struct numa_meminfo numa_meminfo -#ifndef CONFIG_MEMORY_HOTPLUG -__initdata -#endif -; +static struct numa_meminfo numa_meminfo __initdata_or_meminfo; static int numa_distance_cnt; static u8 *numa_distance; diff --git a/include/linux/numa.h b/include/linux/numa.h index 20f4e44b186c..5773cd2613fc 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -13,6 +13,13 @@ #define NUMA_NO_NODE (-1) +/* optionally keep NUMA memory info available post init */ +#ifdef CONFIG_NUMA_KEEP_MEMINFO +#define __initdata_or_meminfo +#else +#define __initdata_or_meminfo __initdata +#endif + #ifdef CONFIG_NUMA int numa_map_to_online_node(int node); #else diff --git a/mm/Kconfig b/mm/Kconfig index fbdc5c70e487..68ce480e50d3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -139,6 +139,10 @@ config HAVE_FAST_GUP config ARCH_KEEP_MEMBLOCK bool +# Keep arch NUMA mapping infrastructure post-init. +config NUMA_KEEP_MEMINFO + bool + config MEMORY_ISOLATION bool @@ -154,6 +158,7 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG + select NUMA_KEEP_MEMINFO if NUMA config MEMORY_HOTPLUG_SPARSE def_bool y -- Gitee From 94096826d288b14067cb655fc196aa3bf01b8439 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 16 Feb 2020 12:01:09 -0800 Subject: [PATCH 004/134] x86/NUMA: Provide a range-to-target_node lookup facility commit 5d30f92e7631286b8617777c5400c8eadcae50a1 upstream. The DEV_DAX_KMEM facility is a generic mechanism to allow device-dax instances, fronting performance-differentiated-memory like pmem, to be added to the System RAM pool. The NUMA node for that hot-added memory is derived from the device-dax instance's 'target_node' attribute. Recall that the 'target_node' is the ACPI-PXM-to-node translation for memory when it comes online whereas the 'numa_node' attribute of the device represents the closest online cpu node. Presently useful target_node information from the ACPI SRAT is discarded with the expectation that "Reserved" memory will never be onlined. Now, DEV_DAX_KMEM violates that assumption, there is a need to retain the translation. Move, rather than discard, numa_memblk data to a secondary array that memory_add_physaddr_to_target_node() may consider at a later point in time. Intel-SIG: commit 5d30f92e7631 x86/NUMA: Provide a range-to-target_node lookup facility Backport for SGX virtualization support on Intel Xeon platform. Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Cc: Andrew Morton Cc: David Hildenbrand Cc: Michal Hocko Reported-by: kbuild test robot Reviewed-by: Ingo Molnar Signed-off-by: Dan Williams Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/158188326978.894464.217282995221175417.stgit@dwillia2-desk3.amr.corp.intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/mm/numa.c | 61 ++++++++++++++++++++++++++++++++++++-------- include/linux/numa.h | 14 +++++++++- 2 files changed, 64 insertions(+), 11 deletions(-) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 68585f42e76f..0dec124ae06d 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -26,6 +26,7 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); static struct numa_meminfo numa_meminfo __initdata_or_meminfo; +static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo; static int numa_distance_cnt; static u8 *numa_distance; @@ -164,6 +165,19 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) (mi->nr_blks - idx) * sizeof(mi->blk[0])); } +/** + * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another + * @dst: numa_meminfo to append block to + * @idx: Index of memblk to remove + * @src: numa_meminfo to remove memblk from + */ +static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx, + struct numa_meminfo *src) +{ + dst->blk[dst->nr_blks++] = src->blk[idx]; + numa_remove_memblk_from(idx, src); +} + /** * numa_add_memblk - Add one numa_memblk to numa_meminfo * @nid: NUMA node ID of the new memblk @@ -233,14 +247,19 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) for (i = 0; i < mi->nr_blks; i++) { struct numa_memblk *bi = &mi->blk[i]; - /* make sure all blocks are inside the limits */ + /* move / save reserved memory ranges */ + if (!memblock_overlaps_region(&memblock.memory, + bi->start, bi->end - bi->start)) { + numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi); + continue; + } + + /* make sure all non-reserved blocks are inside the limits */ bi->start = max(bi->start, low); bi->end = min(bi->end, high); - /* and there's no empty or non-exist block */ - if (bi->start >= bi->end || - !memblock_overlaps_region(&memblock.memory, - bi->start, bi->end - bi->start)) + /* and there's no empty block */ + if (bi->start >= bi->end) numa_remove_memblk_from(i--, mi); } @@ -877,16 +896,38 @@ EXPORT_SYMBOL(cpumask_of_node); #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ -#ifdef CONFIG_MEMORY_HOTPLUG -int memory_add_physaddr_to_nid(u64 start) +#ifdef CONFIG_NUMA_KEEP_MEMINFO +static int meminfo_to_nid(struct numa_meminfo *mi, u64 start) { - struct numa_meminfo *mi = &numa_meminfo; - int nid = mi->blk[0].nid; int i; for (i = 0; i < mi->nr_blks; i++) if (mi->blk[i].start <= start && mi->blk[i].end > start) - nid = mi->blk[i].nid; + return mi->blk[i].nid; + return NUMA_NO_NODE; +} + +int phys_to_target_node(phys_addr_t start) +{ + int nid = meminfo_to_nid(&numa_meminfo, start); + + /* + * Prefer online nodes, but if reserved memory might be + * hot-added continue the search with reserved ranges. 
+ */ + if (nid != NUMA_NO_NODE) + return nid; + + return meminfo_to_nid(&numa_reserved_meminfo, start); +} +EXPORT_SYMBOL_GPL(phys_to_target_node); + +int memory_add_physaddr_to_nid(u64 start) +{ + int nid = meminfo_to_nid(&numa_meminfo, start); + + if (nid == NUMA_NO_NODE) + nid = numa_meminfo.blk[0].nid; return nid; } EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); diff --git a/include/linux/numa.h b/include/linux/numa.h index 5773cd2613fc..a42df804679e 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NUMA_H #define _LINUX_NUMA_H - +#include #ifdef CONFIG_NODES_SHIFT #define NODES_SHIFT CONFIG_NODES_SHIFT @@ -21,12 +21,24 @@ #endif #ifdef CONFIG_NUMA +/* Generic implementation available */ int numa_map_to_online_node(int node); + +/* + * Optional architecture specific implementation, users need a "depends + * on $ARCH" + */ +int phys_to_target_node(phys_addr_t addr); #else static inline int numa_map_to_online_node(int node) { return NUMA_NO_NODE; } + +static inline int phys_to_target_node(phys_addr_t addr) +{ + return NUMA_NO_NODE; +} #endif #endif /* _LINUX_NUMA_H */ -- Gitee From 2a5ec6c3ea5c39fefc9df0e0b9dfecb0b6763a74 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 13 Mar 2020 16:12:51 -0700 Subject: [PATCH 005/134] selftests/harness: Move test child waiting logic commit f46f576280595f7c8feba454526b95a54e635798 upstream. In order to better handle timeout failures, rearrange the child waiting logic into a separate function. This is mostly a copy/paste with an indentation change. To handle pid tracking, a new field is added for the child pid. Also move the alarm() pairing into the function. Intel-SIG: commit f46f57628059 selftests/harness: Move test child waiting logic Backport for SGX virtualization support on Intel Xeon platform. Signed-off-by: Kees Cook Signed-off-by: Shuah Khan [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/kselftest_harness.h | 93 +++++++++++---------- 1 file changed, 49 insertions(+), 44 deletions(-) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 5336b26506ab..c7b67e379219 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -635,6 +635,7 @@ struct __test_metadata { const char *name; void (*fn)(struct __test_metadata *); + pid_t pid; /* pid of test when being run */ int termsig; int passed; int trigger; /* extra handler after the evaluation */ @@ -695,64 +696,68 @@ static inline int __bail(int for_realz, bool no_print, __u8 step) return 0; } -void __run_test(struct __test_metadata *t) +void __wait_for_test(struct __test_metadata *t) { - pid_t child_pid; int status; + alarm(t->timeout); + waitpid(t->pid, &status, 0); + alarm(0); + + if (WIFEXITED(status)) { + t->passed = t->termsig == -1 ? 
!WEXITSTATUS(status) : 0; + if (t->termsig != -1) { + fprintf(TH_LOG_STREAM, + "%s: Test exited normally " + "instead of by signal (code: %d)\n", + t->name, + WEXITSTATUS(status)); + } else if (!t->passed) { + fprintf(TH_LOG_STREAM, + "%s: Test failed at step #%d\n", + t->name, + WEXITSTATUS(status)); + } + } else if (WIFSIGNALED(status)) { + t->passed = 0; + if (WTERMSIG(status) == SIGABRT) { + fprintf(TH_LOG_STREAM, + "%s: Test terminated by assertion\n", + t->name); + } else if (WTERMSIG(status) == t->termsig) { + t->passed = 1; + } else { + fprintf(TH_LOG_STREAM, + "%s: Test terminated unexpectedly " + "by signal %d\n", + t->name, + WTERMSIG(status)); + } + } else { + fprintf(TH_LOG_STREAM, + "%s: Test ended in some other way [%u]\n", + t->name, + status); + } +} + +void __run_test(struct __test_metadata *t) +{ t->passed = 1; t->trigger = 0; printf("[ RUN ] %s\n", t->name); - alarm(t->timeout); - child_pid = fork(); - if (child_pid < 0) { + t->pid = fork(); + if (t->pid < 0) { printf("ERROR SPAWNING TEST CHILD\n"); t->passed = 0; - } else if (child_pid == 0) { + } else if (t->pid == 0) { t->fn(t); /* return the step that failed or 0 */ _exit(t->passed ? 0 : t->step); } else { - /* TODO(wad) add timeout support. */ - waitpid(child_pid, &status, 0); - if (WIFEXITED(status)) { - t->passed = t->termsig == -1 ? !WEXITSTATUS(status) : 0; - if (t->termsig != -1) { - fprintf(TH_LOG_STREAM, - "%s: Test exited normally " - "instead of by signal (code: %d)\n", - t->name, - WEXITSTATUS(status)); - } else if (!t->passed) { - fprintf(TH_LOG_STREAM, - "%s: Test failed at step #%d\n", - t->name, - WEXITSTATUS(status)); - } - } else if (WIFSIGNALED(status)) { - t->passed = 0; - if (WTERMSIG(status) == SIGABRT) { - fprintf(TH_LOG_STREAM, - "%s: Test terminated by assertion\n", - t->name); - } else if (WTERMSIG(status) == t->termsig) { - t->passed = 1; - } else { - fprintf(TH_LOG_STREAM, - "%s: Test terminated unexpectedly " - "by signal %d\n", - t->name, - WTERMSIG(status)); - } - } else { - fprintf(TH_LOG_STREAM, - "%s: Test ended in some other way [%u]\n", - t->name, - status); - } + __wait_for_test(t); } printf("[ %4s ] %s\n", (t->passed ? "OK" : "FAIL"), t->name); - alarm(0); } static int test_harness_run(int __attribute__((unused)) argc, -- Gitee From 000e3628c6f722db7ae895128aaea85c393237ec Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 13 Mar 2020 16:12:52 -0700 Subject: [PATCH 006/134] selftests/harness: Handle timeouts cleanly commit c31801da6e3d7ef6115a0b5af1816a8f7ab120c2 upstream. When a selftest would timeout before, the program would just fall over and no accounting of failures would be reported (i.e. it would result in an incomplete TAP report). Instead, add an explicit SIGALRM handler to cleanly catch and report the timeout. Before: [==========] Running 2 tests from 2 test cases. [ RUN ] timeout.finish [ OK ] timeout.finish [ RUN ] timeout.too_long Alarm clock After: [==========] Running 2 tests from 2 test cases. [ RUN ] timeout.finish [ OK ] timeout.finish [ RUN ] timeout.too_long timeout.too_long: Test terminated by timeout [ FAIL ] timeout.too_long [==========] 1 / 2 tests passed. [ FAILED ] Intel-SIG: commit c31801da6e3d selftests/harness: Handle timeouts cleanly Backport for SGX virtualization support on Intel Xeon platform. 
Signed-off-by: Kees Cook Signed-off-by: Shuah Khan [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/kselftest_harness.h | 53 ++++++++++++++++++++- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index c7b67e379219..2902f6a78f8a 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -639,7 +639,8 @@ struct __test_metadata { int termsig; int passed; int trigger; /* extra handler after the evaluation */ - int timeout; + int timeout; /* seconds to wait for test timeout */ + bool timed_out; /* did this test timeout instead of exiting? */ __u8 step; bool no_print; /* manual trigger when TH_LOG_STREAM is not available */ struct __test_metadata *prev, *next; @@ -696,15 +697,63 @@ static inline int __bail(int for_realz, bool no_print, __u8 step) return 0; } +struct __test_metadata *__active_test; +static void __timeout_handler(int sig, siginfo_t *info, void *ucontext) +{ + struct __test_metadata *t = __active_test; + + /* Sanity check handler execution environment. */ + if (!t) { + fprintf(TH_LOG_STREAM, + "no active test in SIGARLM handler!?\n"); + abort(); + } + if (sig != SIGALRM || sig != info->si_signo) { + fprintf(TH_LOG_STREAM, + "%s: SIGALRM handler caught signal %d!?\n", + t->name, sig != SIGALRM ? sig : info->si_signo); + abort(); + } + + t->timed_out = true; + kill(t->pid, SIGKILL); +} + void __wait_for_test(struct __test_metadata *t) { + struct sigaction action = { + .sa_sigaction = __timeout_handler, + .sa_flags = SA_SIGINFO, + }; + struct sigaction saved_action; int status; + if (sigaction(SIGALRM, &action, &saved_action)) { + t->passed = 0; + fprintf(TH_LOG_STREAM, + "%s: unable to install SIGARLM handler\n", + t->name); + return; + } + __active_test = t; + t->timed_out = false; alarm(t->timeout); waitpid(t->pid, &status, 0); alarm(0); + if (sigaction(SIGALRM, &saved_action, NULL)) { + t->passed = 0; + fprintf(TH_LOG_STREAM, + "%s: unable to uninstall SIGARLM handler\n", + t->name); + return; + } + __active_test = NULL; - if (WIFEXITED(status)) { + if (t->timed_out) { + t->passed = 0; + fprintf(TH_LOG_STREAM, + "%s: Test terminated by timeout\n", t->name); + } else if (WIFEXITED(status)) { t->passed = t->termsig == -1 ? !WEXITSTATUS(status) : 0; if (t->termsig != -1) { fprintf(TH_LOG_STREAM, -- Gitee From aec21e8003770e69e1f6da625ed5e8bf603ce337 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 27 Mar 2020 09:06:48 +0000 Subject: [PATCH 007/134] selftests/harness: fix spelling mistake "SIGARLM" -> "SIGALRM" commit d925c896956283cf12634c4223f62ad2c080da29 upstream. There a few identical spelling mistakes, fix these. Intel-SIG: commit d925c8969562 selftests/harness: fix spelling mistake "SIGARLM" -> "SIGALRM" Backport for SGX virtualization support on Intel Xeon platform. Signed-off-by: Colin Ian King Acked-by: Kees Cook Signed-off-by: Shuah Khan [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/kselftest_harness.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 2902f6a78f8a..2bb8c81fc0b4 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -705,7 +705,7 @@ static void __timeout_handler(int sig, siginfo_t *info, void *ucontext) /* Sanity check handler execution environment. 
*/ if (!t) { fprintf(TH_LOG_STREAM, - "no active test in SIGARLM handler!?\n"); + "no active test in SIGALRM handler!?\n"); abort(); } if (sig != SIGALRM || sig != info->si_signo) { @@ -731,7 +731,7 @@ void __wait_for_test(struct __test_metadata *t) if (sigaction(SIGALRM, &action, &saved_action)) { t->passed = 0; fprintf(TH_LOG_STREAM, - "%s: unable to install SIGARLM handler\n", + "%s: unable to install SIGALRM handler\n", t->name); return; } @@ -743,7 +743,7 @@ void __wait_for_test(struct __test_metadata *t) if (sigaction(SIGALRM, &saved_action, NULL)) { t->passed = 0; fprintf(TH_LOG_STREAM, - "%s: unable to uninstall SIGARLM handler\n", + "%s: unable to uninstall SIGALRM handler\n", t->name); return; } -- Gitee From da16926cd8e51eb8793a30c71c4780359d4a2710 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Apr 2020 18:03:47 -0700 Subject: [PATCH 008/134] kselftest: factor out list manipulation to a helper commit 1a89595c2272aa9b4cd3fda562545dc1d9cd89ed upstream. Kees suggest to factor out the list append code to a macro, since following commits need it, which leads to code duplication. Intel-SIG: commit 1a89595c2272 kselftest: factor out list manipulation to a helper Backport for SGX virtualization support on Intel Xeon platform. Suggested-by: Kees Cook Signed-off-by: Jakub Kicinski Acked-by: Kees Cook Signed-off-by: David S. Miller [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/kselftest_harness.h | 42 ++++++++++++--------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 2bb8c81fc0b4..77f754854f0d 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -631,6 +631,29 @@ } \ } while (0); OPTIONAL_HANDLER(_assert) +/* List helpers */ +#define __LIST_APPEND(head, item) \ +{ \ + /* Circular linked list where only prev is circular. */ \ + if (head == NULL) { \ + head = item; \ + item->next = NULL; \ + item->prev = item; \ + return; \ + } \ + if (__constructor_order == _CONSTRUCTOR_ORDER_FORWARD) { \ + item->next = NULL; \ + item->prev = head->prev; \ + item->prev->next = item; \ + head->prev = item; \ + } else { \ + item->next = head; \ + item->next->prev = item; \ + item->prev = item; \ + head = item; \ + } \ +} + /* Contains all the information for test execution and status checking. */ struct __test_metadata { const char *name; @@ -667,24 +690,7 @@ static int __constructor_order; static inline void __register_test(struct __test_metadata *t) { __test_count++; - /* Circular linked list where only prev is circular. */ - if (__test_list == NULL) { - __test_list = t; - t->next = NULL; - t->prev = t; - return; - } - if (__constructor_order == _CONSTRUCTOR_ORDER_FORWARD) { - t->next = NULL; - t->prev = __test_list->prev; - t->prev->next = t; - __test_list->prev = t; - } else { - t->next = __test_list; - t->next->prev = t; - t->prev = t; - __test_list = t; - } + __LIST_APPEND(__test_list, t); } static inline int __bail(int for_realz, bool no_print, __u8 step) -- Gitee From f8f02cdee855b5df99f634b40a004757c775e37c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Apr 2020 18:03:48 -0700 Subject: [PATCH 009/134] kselftest: create fixture objects commit 142aca6b388c8ab83dc41bd71150cb23115bd285 upstream. Grouping tests by fixture will allow us to parametrize test runs. Create full objects for fixtures. Add a "global" fixture for tests without a fixture. 
Intel-SIG: commit 142aca6b388c kselftest: create fixture objects Backport for SGX virtualization support on Intel Xeon platform. Signed-off-by: Jakub Kicinski Acked-by: Kees Cook Signed-off-by: David S. Miller [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/kselftest_harness.h | 51 +++++++++++++++------ 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 77f754854f0d..de283fd6fc4d 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -169,8 +169,10 @@ #define __TEST_IMPL(test_name, _signal) \ static void test_name(struct __test_metadata *_metadata); \ static struct __test_metadata _##test_name##_object = \ - { .name = "global." #test_name, \ - .fn = &test_name, .termsig = _signal, \ + { .name = #test_name, \ + .fn = &test_name, \ + .fixture = &_fixture_global, \ + .termsig = _signal, \ .timeout = TEST_TIMEOUT_DEFAULT, }; \ static void __attribute__((constructor)) _register_##test_name(void) \ { \ @@ -212,10 +214,12 @@ * populated and cleaned up using FIXTURE_SETUP() and FIXTURE_TEARDOWN(). */ #define FIXTURE(fixture_name) \ + static struct __fixture_metadata _##fixture_name##_fixture_object = \ + { .name = #fixture_name, }; \ static void __attribute__((constructor)) \ _register_##fixture_name##_data(void) \ { \ - __fixture_count++; \ + __register_fixture(&_##fixture_name##_fixture_object); \ } \ FIXTURE_DATA(fixture_name) @@ -309,8 +313,9 @@ } \ static struct __test_metadata \ _##fixture_name##_##test_name##_object = { \ - .name = #fixture_name "." #test_name, \ + .name = #test_name, \ .fn = &wrapper_##fixture_name##_##test_name, \ + .fixture = &_##fixture_name##_fixture_object, \ .termsig = signal, \ .timeout = tmout, \ }; \ @@ -654,11 +659,34 @@ } \ } +/* Contains all the information about a fixture. */ +struct __fixture_metadata { + const char *name; + struct __fixture_metadata *prev, *next; +} _fixture_global __attribute__((unused)) = { + .name = "global", + .prev = &_fixture_global, +}; + +static struct __fixture_metadata *__fixture_list = &_fixture_global; +static unsigned int __fixture_count; +static int __constructor_order; + +#define _CONSTRUCTOR_ORDER_FORWARD 1 +#define _CONSTRUCTOR_ORDER_BACKWARD -1 + +static inline void __register_fixture(struct __fixture_metadata *f) +{ + __fixture_count++; + __LIST_APPEND(__fixture_list, f); +} + /* Contains all the information for test execution and status checking. */ struct __test_metadata { const char *name; void (*fn)(struct __test_metadata *); pid_t pid; /* pid of test when being run */ + struct __fixture_metadata *fixture; int termsig; int passed; int trigger; /* extra handler after the evaluation */ @@ -672,11 +700,6 @@ struct __test_metadata { /* Storage for the (global) tests to be run. 
*/ static struct __test_metadata *__test_list; static unsigned int __test_count; -static unsigned int __fixture_count; -static int __constructor_order; - -#define _CONSTRUCTOR_ORDER_FORWARD 1 -#define _CONSTRUCTOR_ORDER_BACKWARD -1 /* * Since constructors are called in reverse order, reverse the test @@ -796,11 +819,12 @@ void __wait_for_test(struct __test_metadata *t) } } -void __run_test(struct __test_metadata *t) +void __run_test(struct __fixture_metadata *f, + struct __test_metadata *t) { t->passed = 1; t->trigger = 0; - printf("[ RUN ] %s\n", t->name); + printf("[ RUN ] %s.%s\n", f->name, t->name); t->pid = fork(); if (t->pid < 0) { printf("ERROR SPAWNING TEST CHILD\n"); @@ -812,7 +836,8 @@ void __run_test(struct __test_metadata *t) } else { __wait_for_test(t); } - printf("[ %4s ] %s\n", (t->passed ? "OK" : "FAIL"), t->name); + printf("[ %4s ] %s.%s\n", (t->passed ? "OK" : "FAIL"), + f->name, t->name); } static int test_harness_run(int __attribute__((unused)) argc, @@ -828,7 +853,7 @@ static int test_harness_run(int __attribute__((unused)) argc, __test_count, __fixture_count + 1); for (t = __test_list; t; t = t->next) { count++; - __run_test(t); + __run_test(t->fixture, t); if (t->passed) pass_count++; else -- Gitee From a7f253fca4fe7b2394383e6dff3f475968818b3e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Apr 2020 18:03:49 -0700 Subject: [PATCH 010/134] kselftest: run tests by fixture commit e7f304607778e31bfd8e6b00ce2a8f990b265e14 upstream. Now that all tests have a fixture object move from a global list of tests to a list of tests per fixture. Order of tests may change as we will now group and run test fixture by fixture, rather than in declaration order. Intel-SIG: commit e7f304607778 kselftest: run tests by fixture Backport for SGX virtualization support on Intel Xeon platform. Signed-off-by: Jakub Kicinski Acked-by: Kees Cook Signed-off-by: David S. Miller [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/kselftest_harness.h | 23 +++++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index de283fd6fc4d..fa7185e45472 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -659,9 +659,12 @@ } \ } +struct __test_metadata; + /* Contains all the information about a fixture. */ struct __fixture_metadata { const char *name; + struct __test_metadata *tests; struct __fixture_metadata *prev, *next; } _fixture_global __attribute__((unused)) = { .name = "global", @@ -698,7 +701,6 @@ struct __test_metadata { }; /* Storage for the (global) tests to be run. */ -static struct __test_metadata *__test_list; static unsigned int __test_count; /* @@ -713,7 +715,7 @@ static unsigned int __test_count; static inline void __register_test(struct __test_metadata *t) { __test_count++; - __LIST_APPEND(__test_list, t); + __LIST_APPEND(t->fixture->tests, t); } static inline int __bail(int for_realz, bool no_print, __u8 step) @@ -843,6 +845,7 @@ void __run_test(struct __fixture_metadata *f, static int test_harness_run(int __attribute__((unused)) argc, char __attribute__((unused)) **argv) { + struct __fixture_metadata *f; struct __test_metadata *t; int ret = 0; unsigned int count = 0; @@ -851,13 +854,15 @@ static int test_harness_run(int __attribute__((unused)) argc, /* TODO(wad) add optional arguments similar to gtest. 
*/ printf("[==========] Running %u tests from %u test cases.\n", __test_count, __fixture_count + 1); - for (t = __test_list; t; t = t->next) { - count++; - __run_test(t->fixture, t); - if (t->passed) - pass_count++; - else - ret = 1; + for (f = __fixture_list; f; f = f->next) { + for (t = f->tests; t; t = t->next) { + count++; + __run_test(f, t); + if (t->passed) + pass_count++; + else + ret = 1; + } } printf("[==========] %u / %u tests passed.\n", pass_count, count); printf("[ %s ]\n", (ret ? "FAILED" : "PASSED")); -- Gitee From 8b9c74d6f18a253e333edb4f9103e382c475cedf Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Apr 2020 18:03:50 -0700 Subject: [PATCH 011/134] kselftest: add fixture variants commit 74bc7c97fa88ae334752e7b45702d23813df8873 upstream. Allow users to build parameterized variants of fixtures. If fixtures want variants, they call FIXTURE_VARIANT() to declare the structure to fill for each variant. Each fixture will be re-run for each of the variants defined by calling FIXTURE_VARIANT_ADD() with the differing parameters initializing the structure. Since tests are being re-run, additional initialization (steps, no_print) is also added. Intel-SIG: commit 74bc7c97fa88 kselftest: add fixture variants Backport for SGX virtualization support on Intel Xeon platform. Signed-off-by: Jakub Kicinski Acked-by: Kees Cook Signed-off-by: David S. Miller [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- Documentation/dev-tools/kselftest.rst | 3 +- tools/testing/selftests/kselftest_harness.h | 148 ++++++++++++++++---- 2 files changed, 124 insertions(+), 27 deletions(-) diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst index ecdfdc9d4b03..1e4107627e45 100644 --- a/Documentation/dev-tools/kselftest.rst +++ b/Documentation/dev-tools/kselftest.rst @@ -301,7 +301,8 @@ Helpers .. kernel-doc:: tools/testing/selftests/kselftest_harness.h :functions: TH_LOG TEST TEST_SIGNAL FIXTURE FIXTURE_DATA FIXTURE_SETUP - FIXTURE_TEARDOWN TEST_F TEST_HARNESS_MAIN + FIXTURE_TEARDOWN TEST_F TEST_HARNESS_MAIN FIXTURE_VARIANT + FIXTURE_VARIANT_ADD Operators --------- diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index fa7185e45472..c9f03ef93338 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -168,9 +168,15 @@ #define __TEST_IMPL(test_name, _signal) \ static void test_name(struct __test_metadata *_metadata); \ + static inline void wrapper_##test_name( \ + struct __test_metadata *_metadata, \ + struct __fixture_variant_metadata *variant) \ + { \ + test_name(_metadata); \ + } \ static struct __test_metadata _##test_name##_object = \ { .name = #test_name, \ - .fn = &test_name, \ + .fn = &wrapper_##test_name, \ .fixture = &_fixture_global, \ .termsig = _signal, \ .timeout = TEST_TIMEOUT_DEFAULT, }; \ @@ -214,6 +220,7 @@ * populated and cleaned up using FIXTURE_SETUP() and FIXTURE_TEARDOWN(). 
*/ #define FIXTURE(fixture_name) \ + FIXTURE_VARIANT(fixture_name); \ static struct __fixture_metadata _##fixture_name##_fixture_object = \ { .name = #fixture_name, }; \ static void __attribute__((constructor)) \ @@ -245,7 +252,10 @@ #define FIXTURE_SETUP(fixture_name) \ void fixture_name##_setup( \ struct __test_metadata __attribute__((unused)) *_metadata, \ - FIXTURE_DATA(fixture_name) __attribute__((unused)) *self) + FIXTURE_DATA(fixture_name) __attribute__((unused)) *self, \ + const FIXTURE_VARIANT(fixture_name) \ + __attribute__((unused)) *variant) + /** * FIXTURE_TEARDOWN(fixture_name) * *_metadata* is included so that EXPECT_* and ASSERT_* work correctly. @@ -267,6 +277,59 @@ struct __test_metadata __attribute__((unused)) *_metadata, \ FIXTURE_DATA(fixture_name) __attribute__((unused)) *self) +/** + * FIXTURE_VARIANT(fixture_name) - Optionally called once per fixture + * to declare fixture variant + * + * @fixture_name: fixture name + * + * .. code-block:: c + * + * FIXTURE_VARIANT(datatype name) { + * type property1; + * ... + * }; + * + * Defines type of constant parameters provided to FIXTURE_SETUP() and TEST_F() + * as *variant*. Variants allow the same tests to be run with different + * arguments. + */ +#define FIXTURE_VARIANT(fixture_name) struct _fixture_variant_##fixture_name + +/** + * FIXTURE_VARIANT_ADD(fixture_name, variant_name) - Called once per fixture + * variant to setup and register the data + * + * @fixture_name: fixture name + * @variant_name: name of the parameter set + * + * .. code-block:: c + * + * FIXTURE_ADD(datatype name) { + * .property1 = val1; + * ... + * }; + * + * Defines a variant of the test fixture, provided to FIXTURE_SETUP() and + * TEST_F() as *variant*. Tests of each fixture will be run once for each + * variant. + */ +#define FIXTURE_VARIANT_ADD(fixture_name, variant_name) \ + extern FIXTURE_VARIANT(fixture_name) \ + _##fixture_name##_##variant_name##_variant; \ + static struct __fixture_variant_metadata \ + _##fixture_name##_##variant_name##_object = \ + { .name = #variant_name, \ + .data = &_##fixture_name##_##variant_name##_variant}; \ + static void __attribute__((constructor)) \ + _register_##fixture_name##_##variant_name(void) \ + { \ + __register_fixture_variant(&_##fixture_name##_fixture_object, \ + &_##fixture_name##_##variant_name##_object); \ + } \ + FIXTURE_VARIANT(fixture_name) \ + _##fixture_name##_##variant_name##_variant = + /** * TEST_F(fixture_name, test_name) - Emits test registration and helpers for * fixture-based test cases @@ -297,18 +360,20 @@ #define __TEST_F_IMPL(fixture_name, test_name, signal, tmout) \ static void fixture_name##_##test_name( \ struct __test_metadata *_metadata, \ - FIXTURE_DATA(fixture_name) *self); \ + FIXTURE_DATA(fixture_name) *self, \ + const FIXTURE_VARIANT(fixture_name) *variant); \ static inline void wrapper_##fixture_name##_##test_name( \ - struct __test_metadata *_metadata) \ + struct __test_metadata *_metadata, \ + struct __fixture_variant_metadata *variant) \ { \ /* fixture data is alloced, setup, and torn down per call. */ \ FIXTURE_DATA(fixture_name) self; \ memset(&self, 0, sizeof(FIXTURE_DATA(fixture_name))); \ - fixture_name##_setup(_metadata, &self); \ + fixture_name##_setup(_metadata, &self, variant->data); \ /* Let setup failure terminate early. 
*/ \ if (!_metadata->passed) \ return; \ - fixture_name##_##test_name(_metadata, &self); \ + fixture_name##_##test_name(_metadata, &self, variant->data); \ fixture_name##_teardown(_metadata, &self); \ } \ static struct __test_metadata \ @@ -326,7 +391,9 @@ } \ static void fixture_name##_##test_name( \ struct __test_metadata __attribute__((unused)) *_metadata, \ - FIXTURE_DATA(fixture_name) __attribute__((unused)) *self) + FIXTURE_DATA(fixture_name) __attribute__((unused)) *self, \ + const FIXTURE_VARIANT(fixture_name) \ + __attribute__((unused)) *variant) /** * TEST_HARNESS_MAIN - Simple wrapper to run the test harness @@ -660,11 +727,13 @@ } struct __test_metadata; +struct __fixture_variant_metadata; /* Contains all the information about a fixture. */ struct __fixture_metadata { const char *name; struct __test_metadata *tests; + struct __fixture_variant_metadata *variant; struct __fixture_metadata *prev, *next; } _fixture_global __attribute__((unused)) = { .name = "global", @@ -672,7 +741,6 @@ struct __fixture_metadata { }; static struct __fixture_metadata *__fixture_list = &_fixture_global; -static unsigned int __fixture_count; static int __constructor_order; #define _CONSTRUCTOR_ORDER_FORWARD 1 @@ -680,14 +748,27 @@ static int __constructor_order; static inline void __register_fixture(struct __fixture_metadata *f) { - __fixture_count++; __LIST_APPEND(__fixture_list, f); } +struct __fixture_variant_metadata { + const char *name; + const void *data; + struct __fixture_variant_metadata *prev, *next; +}; + +static inline void +__register_fixture_variant(struct __fixture_metadata *f, + struct __fixture_variant_metadata *variant) +{ + __LIST_APPEND(f->variant, variant); +} + /* Contains all the information for test execution and status checking. */ struct __test_metadata { const char *name; - void (*fn)(struct __test_metadata *); + void (*fn)(struct __test_metadata *, + struct __fixture_variant_metadata *); pid_t pid; /* pid of test when being run */ struct __fixture_metadata *fixture; int termsig; @@ -700,9 +781,6 @@ struct __test_metadata { struct __test_metadata *prev, *next; }; -/* Storage for the (global) tests to be run. */ -static unsigned int __test_count; - /* * Since constructors are called in reverse order, reverse the test * list so tests are run in source declaration order. @@ -714,7 +792,6 @@ static unsigned int __test_count; */ static inline void __register_test(struct __test_metadata *t) { - __test_count++; __LIST_APPEND(t->fixture->tests, t); } @@ -822,46 +899,65 @@ void __wait_for_test(struct __test_metadata *t) } void __run_test(struct __fixture_metadata *f, + struct __fixture_variant_metadata *variant, struct __test_metadata *t) { + /* reset test struct */ t->passed = 1; t->trigger = 0; - printf("[ RUN ] %s.%s\n", f->name, t->name); + t->step = 0; + t->no_print = 0; + + printf("[ RUN ] %s%s%s.%s\n", + f->name, variant->name[0] ? "." : "", variant->name, t->name); t->pid = fork(); if (t->pid < 0) { printf("ERROR SPAWNING TEST CHILD\n"); t->passed = 0; } else if (t->pid == 0) { - t->fn(t); + t->fn(t, variant); /* return the step that failed or 0 */ _exit(t->passed ? 0 : t->step); } else { __wait_for_test(t); } - printf("[ %4s ] %s.%s\n", (t->passed ? "OK" : "FAIL"), - f->name, t->name); + printf("[ %4s ] %s%s%s.%s\n", (t->passed ? "OK" : "FAIL"), + f->name, variant->name[0] ? "." 
: "", variant->name, t->name); } static int test_harness_run(int __attribute__((unused)) argc, char __attribute__((unused)) **argv) { + struct __fixture_variant_metadata no_variant = { .name = "", }; + struct __fixture_variant_metadata *v; struct __fixture_metadata *f; struct __test_metadata *t; int ret = 0; + unsigned int case_count = 0, test_count = 0; unsigned int count = 0; unsigned int pass_count = 0; + for (f = __fixture_list; f; f = f->next) { + for (v = f->variant ?: &no_variant; v; v = v->next) { + case_count++; + for (t = f->tests; t; t = t->next) + test_count++; + } + } + /* TODO(wad) add optional arguments similar to gtest. */ printf("[==========] Running %u tests from %u test cases.\n", - __test_count, __fixture_count + 1); + test_count, case_count); for (f = __fixture_list; f; f = f->next) { - for (t = f->tests; t; t = t->next) { - count++; - __run_test(f, t); - if (t->passed) - pass_count++; - else - ret = 1; + for (v = f->variant ?: &no_variant; v; v = v->next) { + for (t = f->tests; t; t = t->next) { + count++; + __run_test(f, v, t); + if (t->passed) + pass_count++; + else + ret = 1; + } } } printf("[==========] %u / %u tests passed.\n", pass_count, count); -- Gitee From fef1ba94b699eb6a0e76a2f7af28dbaec4312565 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 22 Jun 2020 11:16:48 -0700 Subject: [PATCH 012/134] selftests/harness: Switch to TAP output commit e80068be21824e4d2726eeea41cac6178d212415 upstream. Using the kselftest_harness.h would result in non-TAP test reporting, which didn't make much sense given that all the requirements for using the low-level API were met. Switch to using ksft_*() helpers while retaining as much of a human-readability as possible. Intel-SIG: commit e80068be2182 selftests/harness: Switch to TAP output Backport for SGX virtualization support on Intel Xeon platform. Signed-off-by: Kees Cook Signed-off-by: Shuah Khan [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/kselftest.h | 5 +- tools/testing/selftests/kselftest_harness.h | 52 ++++++++++++--------- 2 files changed, 33 insertions(+), 24 deletions(-) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 2cd5eefed4d2..9b4efdbb07f6 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -1,7 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * kselftest.h: kselftest framework return codes to include from - * selftests. + * kselftest.h: low-level kselftest framework to include from + * selftest programs. When possible, please use + * kselftest_harness.h instead. * * Copyright (c) 2014 Shuah Khan * Copyright (c) 2014 Samsung Electronics Co., Ltd. diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index c9f03ef93338..f8f7e47c739a 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -50,7 +50,9 @@ #ifndef __KSELFTEST_HARNESS_H #define __KSELFTEST_HARNESS_H +#ifndef _GNU_SOURCE #define _GNU_SOURCE +#endif #include #include #include @@ -62,6 +64,8 @@ #include #include +#include "kselftest.h" + #define TEST_TIMEOUT_DEFAULT 30 /* Utilities exposed to the test definitions */ @@ -104,7 +108,7 @@ /* Unconditional logger for internal use. */ #define __TH_LOG(fmt, ...) \ - fprintf(TH_LOG_STREAM, "%s:%d:%s:" fmt "\n", \ + fprintf(TH_LOG_STREAM, "# %s:%d:%s:" fmt "\n", \ __FILE__, __LINE__, _metadata->name, ##__VA_ARGS__) /** @@ -119,7 +123,7 @@ */ #define XFAIL(statement, fmt, ...) 
do { \ if (TH_LOG_ENABLED) { \ - fprintf(TH_LOG_STREAM, "[ XFAIL! ] " fmt "\n", \ + fprintf(TH_LOG_STREAM, "# XFAIL " fmt "\n", \ ##__VA_ARGS__); \ } \ /* TODO: find a way to pass xfail to test runner process. */ \ @@ -813,12 +817,12 @@ static void __timeout_handler(int sig, siginfo_t *info, void *ucontext) /* Sanity check handler execution environment. */ if (!t) { fprintf(TH_LOG_STREAM, - "no active test in SIGALRM handler!?\n"); + "# no active test in SIGALRM handler!?\n"); abort(); } if (sig != SIGALRM || sig != info->si_signo) { fprintf(TH_LOG_STREAM, - "%s: SIGALRM handler caught signal %d!?\n", + "# %s: SIGALRM handler caught signal %d!?\n", t->name, sig != SIGALRM ? sig : info->si_signo); abort(); } @@ -839,7 +843,7 @@ void __wait_for_test(struct __test_metadata *t) if (sigaction(SIGALRM, &action, &saved_action)) { t->passed = 0; fprintf(TH_LOG_STREAM, - "%s: unable to install SIGALRM handler\n", + "# %s: unable to install SIGALRM handler\n", t->name); return; } @@ -851,7 +855,7 @@ void __wait_for_test(struct __test_metadata *t) if (sigaction(SIGALRM, &saved_action, NULL)) { t->passed = 0; fprintf(TH_LOG_STREAM, - "%s: unable to uninstall SIGALRM handler\n", + "# %s: unable to uninstall SIGALRM handler\n", t->name); return; } @@ -860,18 +864,17 @@ void __wait_for_test(struct __test_metadata *t) if (t->timed_out) { t->passed = 0; fprintf(TH_LOG_STREAM, - "%s: Test terminated by timeout\n", t->name); + "# %s: Test terminated by timeout\n", t->name); } else if (WIFEXITED(status)) { t->passed = t->termsig == -1 ? !WEXITSTATUS(status) : 0; if (t->termsig != -1) { fprintf(TH_LOG_STREAM, - "%s: Test exited normally " - "instead of by signal (code: %d)\n", + "# %s: Test exited normally instead of by signal (code: %d)\n", t->name, WEXITSTATUS(status)); } else if (!t->passed) { fprintf(TH_LOG_STREAM, - "%s: Test failed at step #%d\n", + "# %s: Test failed at step #%d\n", t->name, WEXITSTATUS(status)); } @@ -879,20 +882,19 @@ void __wait_for_test(struct __test_metadata *t) t->passed = 0; if (WTERMSIG(status) == SIGABRT) { fprintf(TH_LOG_STREAM, - "%s: Test terminated by assertion\n", + "# %s: Test terminated by assertion\n", t->name); } else if (WTERMSIG(status) == t->termsig) { t->passed = 1; } else { fprintf(TH_LOG_STREAM, - "%s: Test terminated unexpectedly " - "by signal %d\n", + "# %s: Test terminated unexpectedly by signal %d\n", t->name, WTERMSIG(status)); } } else { fprintf(TH_LOG_STREAM, - "%s: Test ended in some other way [%u]\n", + "# %s: Test ended in some other way [%u]\n", t->name, status); } @@ -908,11 +910,11 @@ void __run_test(struct __fixture_metadata *f, t->step = 0; t->no_print = 0; - printf("[ RUN ] %s%s%s.%s\n", + ksft_print_msg(" RUN %s%s%s.%s ...\n", f->name, variant->name[0] ? "." : "", variant->name, t->name); t->pid = fork(); if (t->pid < 0) { - printf("ERROR SPAWNING TEST CHILD\n"); + ksft_print_msg("ERROR SPAWNING TEST CHILD\n"); t->passed = 0; } else if (t->pid == 0) { t->fn(t, variant); @@ -921,7 +923,9 @@ void __run_test(struct __fixture_metadata *f, } else { __wait_for_test(t); } - printf("[ %4s ] %s%s%s.%s\n", (t->passed ? "OK" : "FAIL"), + ksft_print_msg(" %4s %s%s%s.%s\n", t->passed ? "OK" : "FAIL", + f->name, variant->name[0] ? "." : "", variant->name, t->name); + ksft_test_result(t->passed, "%s%s%s.%s\n", f->name, variant->name[0] ? "." : "", variant->name, t->name); } @@ -945,8 +949,9 @@ static int test_harness_run(int __attribute__((unused)) argc, } } - /* TODO(wad) add optional arguments similar to gtest. 
*/ - printf("[==========] Running %u tests from %u test cases.\n", + ksft_print_header(); + ksft_set_plan(test_count); + ksft_print_msg("Starting %u tests from %u test cases.\n", test_count, case_count); for (f = __fixture_list; f; f = f->next) { for (v = f->variant ?: &no_variant; v; v = v->next) { @@ -960,9 +965,12 @@ static int test_harness_run(int __attribute__((unused)) argc, } } } - printf("[==========] %u / %u tests passed.\n", pass_count, count); - printf("[ %s ]\n", (ret ? "FAILED" : "PASSED")); - return ret; + ksft_print_msg("%s: %u / %u tests passed.\n", ret ? "FAILED" : "PASSED", + pass_count, count); + ksft_exit(ret == 0); + + /* unreachable */ + return KSFT_FAIL; } static void __attribute__((constructor)) __constructor_order_first(void) -- Gitee From 09ece9dd453dec1d2b909745e94a3714cac628f6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 22 Jun 2020 11:16:49 -0700 Subject: [PATCH 013/134] selftests/harness: Refactor XFAIL into SKIP commit 9847d24af95c7fa146c9a0e82505a78e29161c10 upstream. Plumb the old XFAIL result into a TAP SKIP. Intel-SIG: commit 9847d24af95c selftests/harness: Refactor XFAIL into SKIP Backport for SGX virtualization support on Intel Xeon platform. Signed-off-by: Kees Cook Signed-off-by: Shuah Khan [ Zhiquan Li: amend commit log and resolve the conflict. Some chunks in tools/testing/selftests/seccomp/seccomp_bpf.c, TEST(user_notification_continue) has not been introduced on v5.4, so discarding these changes. ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/kselftest_harness.h | 64 ++++++++++++++----- tools/testing/selftests/seccomp/seccomp_bpf.c | 4 +- 2 files changed, 50 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index f8f7e47c739a..b519765904a6 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -112,22 +112,22 @@ __FILE__, __LINE__, _metadata->name, ##__VA_ARGS__) /** - * XFAIL(statement, fmt, ...) + * SKIP(statement, fmt, ...) * - * @statement: statement to run after reporting XFAIL + * @statement: statement to run after reporting SKIP * @fmt: format string * @...: optional arguments * - * This forces a "pass" after reporting a failure with an XFAIL prefix, + * This forces a "pass" after reporting why something is being skipped * and runs "statement", which is usually "return" or "goto skip". */ -#define XFAIL(statement, fmt, ...) do { \ +#define SKIP(statement, fmt, ...) do { \ if (TH_LOG_ENABLED) { \ - fprintf(TH_LOG_STREAM, "# XFAIL " fmt "\n", \ + fprintf(TH_LOG_STREAM, "# SKIP " fmt "\n", \ ##__VA_ARGS__); \ } \ - /* TODO: find a way to pass xfail to test runner process. */ \ _metadata->passed = 1; \ + _metadata->skip = 1; \ _metadata->trigger = 0; \ statement; \ } while (0) @@ -777,6 +777,7 @@ struct __test_metadata { struct __fixture_metadata *fixture; int termsig; int passed; + int skip; /* did SKIP get used? */ int trigger; /* extra handler after the evaluation */ int timeout; /* seconds to wait for test timeout */ bool timed_out; /* did this test timeout instead of exiting? */ @@ -866,17 +867,31 @@ void __wait_for_test(struct __test_metadata *t) fprintf(TH_LOG_STREAM, "# %s: Test terminated by timeout\n", t->name); } else if (WIFEXITED(status)) { - t->passed = t->termsig == -1 ? 
!WEXITSTATUS(status) : 0; if (t->termsig != -1) { + t->passed = 0; fprintf(TH_LOG_STREAM, "# %s: Test exited normally instead of by signal (code: %d)\n", t->name, WEXITSTATUS(status)); - } else if (!t->passed) { - fprintf(TH_LOG_STREAM, - "# %s: Test failed at step #%d\n", - t->name, - WEXITSTATUS(status)); + } else { + switch (WEXITSTATUS(status)) { + /* Success */ + case 0: + t->passed = 1; + break; + /* SKIP */ + case 255: + t->passed = 1; + t->skip = 1; + break; + /* Other failure, assume step report. */ + default: + t->passed = 0; + fprintf(TH_LOG_STREAM, + "# %s: Test failed at step #%d\n", + t->name, + WEXITSTATUS(status)); + } } } else if (WIFSIGNALED(status)) { t->passed = 0; @@ -906,6 +921,7 @@ void __run_test(struct __fixture_metadata *f, { /* reset test struct */ t->passed = 1; + t->skip = 0; t->trigger = 0; t->step = 0; t->no_print = 0; @@ -918,15 +934,31 @@ void __run_test(struct __fixture_metadata *f, t->passed = 0; } else if (t->pid == 0) { t->fn(t, variant); - /* return the step that failed or 0 */ - _exit(t->passed ? 0 : t->step); + /* Make sure step doesn't get lost in reporting */ + if (t->step >= 255) { + ksft_print_msg("Too many test steps (%u)!?\n", t->step); + t->step = 254; + } + /* Use 255 for SKIP */ + if (t->skip) + _exit(255); + /* Pass is exit 0 */ + if (t->passed) + _exit(0); + /* Something else happened, report the step. */ + _exit(t->step); } else { __wait_for_test(t); } ksft_print_msg(" %4s %s%s%s.%s\n", t->passed ? "OK" : "FAIL", f->name, variant->name[0] ? "." : "", variant->name, t->name); - ksft_test_result(t->passed, "%s%s%s.%s\n", - f->name, variant->name[0] ? "." : "", variant->name, t->name); + + if (t->skip) + ksft_test_result_skip("%s%s%s.%s\n", + f->name, variant->name[0] ? "." : "", variant->name, t->name); + else + ksft_test_result(t->passed, "%s%s%s.%s\n", + f->name, variant->name[0] ? "." : "", variant->name, t->name); } static int test_harness_run(int __attribute__((unused)) argc, diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 19c7351eeb74..43f9d08f95d3 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -3001,7 +3001,7 @@ TEST(get_metadata) /* Only real root can get metadata. */ if (geteuid()) { - XFAIL(return, "get_metadata requires real root"); + SKIP(return, "get_metadata requires real root"); return; } @@ -3044,7 +3044,7 @@ TEST(get_metadata) ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md); EXPECT_EQ(sizeof(md), ret) { if (errno == EINVAL) - XFAIL(goto skip, "Kernel does not support PTRACE_SECCOMP_GET_METADATA (missing CONFIG_CHECKPOINT_RESTORE?)"); + SKIP(goto skip, "Kernel does not support PTRACE_SECCOMP_GET_METADATA (missing CONFIG_CHECKPOINT_RESTORE?)"); } EXPECT_EQ(md.flags, SECCOMP_FILTER_FLAG_LOG); -- Gitee From 2cd804602fca1a0d5395c0234fb20917cb510ddc Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Jun 2020 21:33:14 -0700 Subject: [PATCH 014/134] mmap locking API: initial implementation as rwsem wrappers commit 9740ca4e95b43b91a4a848694a20d01ba6818f7b upstream. This patch series adds a new mmap locking API replacing the existing mmap_sem lock and unlocks. Initially the API is just implemente in terms of inlined rwsem calls, so it doesn't provide any new functionality. There are two justifications for the new API: - At first, it provides an easy hooking point to instrument mmap_sem locking latencies independently of any other rwsems. 
- In the future, it may be a starting point for replacing the rwsem implementation with a different one, such as range locks. This is something that is being explored, even though there is no wide concensus about this possible direction yet. (see https://patchwork.kernel.org/cover/11401483/) This patch (of 12): This change wraps the existing mmap_sem related rwsem calls into a new mmap locking API. There are two justifications for the new API: - At first, it provides an easy hooking point to instrument mmap_sem locking latencies independently of any other rwsems. - In the future, it may be a starting point for replacing the rwsem implementation with a different one, such as range locks. Intel-SIG: commit 9740ca4e95b4 mmap locking API: initial implementation as rwsem wrappers Backport for SGX virtualization support on Intel Xeon platform. Signed-off-by: Michel Lespinasse Signed-off-by: Andrew Morton Reviewed-by: Daniel Jordan Reviewed-by: Davidlohr Bueso Reviewed-by: Laurent Dufour Reviewed-by: Vlastimil Babka Cc: Peter Zijlstra Cc: Matthew Wilcox Cc: Liam Howlett Cc: Jerome Glisse Cc: David Rientjes Cc: Hugh Dickins Cc: Ying Han Cc: Jason Gunthorpe Cc: John Hubbard Cc: Michel Lespinasse Link: http://lkml.kernel.org/r/20200520052908.204642-1-walken@google.com Link: http://lkml.kernel.org/r/20200520052908.204642-2-walken@google.com Signed-off-by: Linus Torvalds [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- include/linux/mm.h | 1 + include/linux/mmap_lock.h | 54 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 include/linux/mmap_lock.h diff --git a/include/linux/mm.h b/include/linux/mm.h index 1cc662ad866b..cafada5bd593 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h new file mode 100644 index 000000000000..97ac53b66052 --- /dev/null +++ b/include/linux/mmap_lock.h @@ -0,0 +1,54 @@ +#ifndef _LINUX_MMAP_LOCK_H +#define _LINUX_MMAP_LOCK_H + +static inline void mmap_init_lock(struct mm_struct *mm) +{ + init_rwsem(&mm->mmap_sem); +} + +static inline void mmap_write_lock(struct mm_struct *mm) +{ + down_write(&mm->mmap_sem); +} + +static inline int mmap_write_lock_killable(struct mm_struct *mm) +{ + return down_write_killable(&mm->mmap_sem); +} + +static inline bool mmap_write_trylock(struct mm_struct *mm) +{ + return down_write_trylock(&mm->mmap_sem) != 0; +} + +static inline void mmap_write_unlock(struct mm_struct *mm) +{ + up_write(&mm->mmap_sem); +} + +static inline void mmap_write_downgrade(struct mm_struct *mm) +{ + downgrade_write(&mm->mmap_sem); +} + +static inline void mmap_read_lock(struct mm_struct *mm) +{ + down_read(&mm->mmap_sem); +} + +static inline int mmap_read_lock_killable(struct mm_struct *mm) +{ + return down_read_killable(&mm->mmap_sem); +} + +static inline bool mmap_read_trylock(struct mm_struct *mm) +{ + return down_read_trylock(&mm->mmap_sem) != 0; +} + +static inline void mmap_read_unlock(struct mm_struct *mm) +{ + up_read(&mm->mmap_sem); +} + +#endif /* _LINUX_MMAP_LOCK_H */ -- Gitee From 9843f77ef0cedf73de022223d461a70c9d328943 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 17 Oct 2020 16:13:40 -0700 Subject: [PATCH 015/134] mm, memcg: rework remote charging API to support nesting commit b87d8cefe43c7f22e8aa13919c1dfa2b4b4b4e01 upstream. 
Currently the remote memcg charging API consists of two functions: memalloc_use_memcg() and memalloc_unuse_memcg(), which set and clear the memcg value that overwrites the memcg of the current task.

memalloc_use_memcg(target_memcg); <...> memalloc_unuse_memcg();

It works perfectly for allocations performed from a normal context; however, an attempt to call it from an interrupt context, or to nest two remote charging blocks, leads to incorrect accounting. On exit from the inner block the active memcg is cleared instead of being restored.

memalloc_use_memcg(target_memcg); memalloc_use_memcg(target_memcg_2); <...> memalloc_unuse_memcg();

Error: allocations here are charged to the memcg of the current process instead of target_memcg.

memalloc_unuse_memcg();

This patch extends the remote charging API by switching to a single function: struct mem_cgroup *set_active_memcg(struct mem_cgroup *memcg), which sets the new value and returns the old one. So a remote charging block will look like:

old_memcg = set_active_memcg(target_memcg); <...> set_active_memcg(old_memcg);

This patch is heavily based on a patch by Johannes Weiner, which can be found here: https://lkml.org/lkml/2020/5/28/806.

Intel-SIG: commit b87d8cefe43c mm, memcg: rework remote charging API to support nesting Backport for SGX virtualization support on Intel Xeon platform.

Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Dan Schatzberg Link: https://lkml.kernel.org/r/20200821212056.3769116-1-guro@fb.com Signed-off-by: Linus Torvalds [ Zhiquan Li: amend commit log and resolve the conflict. The memalloc_use_memcg()/memalloc_unuse_memcg() calls in mm/memcontrol.c are not present in v5.4, so those changes are discarded. Since the API has been changed to set_active_memcg(), it will be noticed if calls to the old APIs are backported in the future.
] Signed-off-by: Zhiquan Li --- fs/buffer.c | 6 +++--- fs/notify/fanotify/fanotify.c | 5 +++-- fs/notify/inotify/inotify_fsnotify.c | 5 +++-- include/linux/sched/mm.h | 30 ++++++++++------------------ 4 files changed, 19 insertions(+), 27 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 0d7bd7712076..edabb81cbe3a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -817,13 +817,13 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, struct buffer_head *bh, *head; gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; long offset; - struct mem_cgroup *memcg; + struct mem_cgroup *memcg, *old_memcg; if (retry) gfp |= __GFP_NOFAIL; memcg = get_mem_cgroup_from_page(page); - memalloc_use_memcg(memcg); + old_memcg = set_active_memcg(memcg); head = NULL; offset = PAGE_SIZE; @@ -842,7 +842,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, set_bh_page(bh, page, offset); } out: - memalloc_unuse_memcg(); + set_active_memcg(old_memcg); mem_cgroup_put(memcg); return head; /* diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index d24548ed31b9..ee6b920443c8 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -290,6 +290,7 @@ struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, struct fanotify_event *event = NULL; gfp_t gfp = GFP_KERNEL_ACCOUNT; struct inode *id = fanotify_fid_inode(inode, mask, data, data_type); + struct mem_cgroup *old_memcg; /* * For queues with unlimited length lost events are not expected and @@ -303,7 +304,7 @@ struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, gfp |= __GFP_RETRY_MAYFAIL; /* Whoever is interested in the event, pays for the allocation. */ - memalloc_use_memcg(group->memcg); + old_memcg = set_active_memcg(group->memcg); if (fanotify_is_perm_event(mask)) { struct fanotify_perm_event *pevent; @@ -345,7 +346,7 @@ init: __maybe_unused event->path.dentry = NULL; } out: - memalloc_unuse_memcg(); + set_active_memcg(old_memcg); return event; } diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 589dee962993..126367db52af 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -68,6 +68,7 @@ int inotify_handle_event(struct fsnotify_group *group, int ret; int len = 0; int alloc_len = sizeof(struct inotify_event_info); + struct mem_cgroup *old_memcg; if (WARN_ON(fsnotify_iter_vfsmount_mark(iter_info))) return 0; @@ -95,9 +96,9 @@ int inotify_handle_event(struct fsnotify_group *group, * trigger OOM killer in the target monitoring memcg as it may have * security repercussion. */ - memalloc_use_memcg(group->memcg); + old_memcg = set_active_memcg(group->memcg); event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); - memalloc_unuse_memcg(); + set_active_memcg(old_memcg); if (unlikely(!event)) { /* diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index c9fef831ba21..f939848eee57 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -313,38 +313,28 @@ static inline void memalloc_nocma_restore(unsigned int flags) #ifdef CONFIG_MEMCG /** - * memalloc_use_memcg - Starts the remote memcg charging scope. + * set_active_memcg - Starts the remote memcg charging scope. * @memcg: memcg to charge. * * This function marks the beginning of the remote memcg charging scope. All the * __GFP_ACCOUNT allocations till the end of the scope will be charged to the * given memcg. * - * NOTE: This function is not nesting safe. 
+ * NOTE: This function can nest. Users must save the return value and + * reset the previous value after their own charging scope is over. */ -static inline void memalloc_use_memcg(struct mem_cgroup *memcg) +static inline struct mem_cgroup * +set_active_memcg(struct mem_cgroup *memcg) { - WARN_ON_ONCE(current->active_memcg); + struct mem_cgroup *old = current->active_memcg; current->active_memcg = memcg; -} - -/** - * memalloc_unuse_memcg - Ends the remote memcg charging scope. - * - * This function marks the end of the remote memcg charging scope started by - * memalloc_use_memcg(). - */ -static inline void memalloc_unuse_memcg(void) -{ - current->active_memcg = NULL; + return old; } #else -static inline void memalloc_use_memcg(struct mem_cgroup *memcg) -{ -} - -static inline void memalloc_unuse_memcg(void) +static inline struct mem_cgroup * +set_active_memcg(struct mem_cgroup *memcg) { + return NULL; } #endif -- Gitee From 22b194677cc8d4e14a9b2550b2a66a694e768e4e Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 25 Jan 2022 19:52:23 +0800 Subject: [PATCH 016/134] KVM: x86/cpuid: Exclude unpermitted xfeatures sizes at KVM_GET_SUPPORTED_CPUID commit 1ffce0924a8c86cf0590c039cd5f5c9375d32e9b upstream. With the help of xstate_get_guest_group_perm(), KVM can exclude unpermitted xfeatures in cpuid.0xd.0.eax, in which case the corresponding xfeatures sizes should also be matched to the permitted xfeatures. To fix this inconsistency, the permitted_xcr0 and permitted_xss are defined consistently, which implies 'supported' plus certain permissions for this task, and it also fixes cpuid.0xd.1.ebx and later leaf-by-leaf queries. Intel-SIG: commit 1ffce0924a8c KVM: x86/cpuid: Exclude unpermitted xfeatures sizes at KVM_GET_SUPPORTED_CPUID Backport for SGX virtualization support on Intel Xeon platform. Fixes: 445ecdf79be0 ("kvm: x86: Exclude unpermitted xfeatures at KVM_GET_SUPPORTED_CPUID") Signed-off-by: Like Xu Message-Id: <20220125115223.33707-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log and resolve the conflict. Fix Qemu crash issue when launching VM: qemu-system-x86_64: ../target/i386/kvm/kvm.c:1995: kvm_arch_init_vcpu: Assertion `kvm_arch_get_supported_cpuid(kvm_state, 0xd, 0, R_ECX) <= . 
Aborted (core dumped) ] Signed-off-by: Zhiquan Li --- arch/x86/kvm/cpuid.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 4d90f3cb4b46..c22b913ab541 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -689,14 +689,14 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; case 0xd: { int idx; - u64 supported = kvm_supported_xcr0(); - u64 guest_perm = xstate_get_guest_group_perm(); + u64 permitted_xcr0 = kvm_supported_xcr0() & xstate_get_guest_group_perm(); + u64 permitted_xss = supported_xss; - entry->eax &= supported & guest_perm; - entry->ebx = xstate_required_size(supported, false); + entry->eax &= permitted_xcr0; + entry->ebx = xstate_required_size(permitted_xcr0, false); entry->ecx = entry->ebx; - entry->edx &= (supported & guest_perm) >> 32; - if (!supported) + entry->edx &= permitted_xcr0 >> 32; + if (!permitted_xcr0) break; entry = do_host_cpuid(array, function, 1); @@ -705,20 +705,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) cpuid_entry_override(entry, CPUID_D_1_EAX); if (entry->eax & (F(XSAVES)|F(XSAVEC))) - entry->ebx = xstate_required_size(supported | supported_xss, + entry->ebx = xstate_required_size(permitted_xcr0 | permitted_xss, true); else { - WARN_ON_ONCE(supported_xss != 0); + WARN_ON_ONCE(permitted_xss != 0); entry->ebx = 0; } - entry->ecx &= supported_xss; - entry->edx &= supported_xss >> 32; + entry->ecx &= permitted_xss; + entry->edx &= permitted_xss >> 32; for (idx = 2; idx < 64; ++idx) { bool s_state; - if (supported & BIT_ULL(idx)) + if (permitted_xcr0 & BIT_ULL(idx)) s_state = false; - else if (supported_xss & BIT_ULL(idx)) + else if (permitted_xss & BIT_ULL(idx)) s_state = true; else continue; @@ -732,7 +732,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) * invalid sub-leafs. Only valid sub-leafs should * reach this point, and they should have a non-zero * save state size. Furthermore, check whether the - * processor agrees with supported_xcr0/supported_xss + * processor agrees with permitted_xcr0/permitted_xss * on whether this is an XCR0- or IA32_XSS-managed area. */ if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) { -- Gitee From 608e6b4b02eb1205fe93cb1218fb0f167d3e1fb3 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Thu, 18 Mar 2021 01:53:30 +0200 Subject: [PATCH 017/134] x86/sgx: Replace section->init_laundry_list with sgx_dirty_page_list commit 51ab30eb2ad4c4a61f827dc18863cd70dc46dc32 upstream. During normal runtime, the "ksgxd" daemon behaves like a version of kswapd just for SGX. But, before it starts acting like kswapd, its first job is to initialize enclave memory. Currently, the SGX boot code places each enclave page on a epc_section->init_laundry_list. Once it starts up, the ksgxd code walks over that list and populates the actual SGX page allocator. However, the per-section structures are going away to make way for the SGX NUMA allocator. There's also little need to have a per-section structure; the enclave pages are all treated identically, and they can be placed on the correct allocator list from metadata stored in the enclave page (struct sgx_epc_page) itself. Modify sgx_sanitize_section() to take a single page list instead of taking a section and deriving the list from there. 
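With that, the ksgxd() startup sequence (see the hunk below) reduces to:

	/*
	 * Two passes are still required: the second pass catches SECS
	 * pages whose child pages blocked EREMOVE on the first pass.
	 */
	__sgx_sanitize_pages(&sgx_dirty_page_list);
	__sgx_sanitize_pages(&sgx_dirty_page_list);

	/* sanity check: */
	WARN_ON(!list_empty(&sgx_dirty_page_list));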
Intel-SIG: commit 51ab30eb2ad4 x86/sgx: Replace section->init_laundry_list with sgx_dirty_page_list Incremental backporting patches for SGX on Intel Xeon platform. Signed-off-by: Jarkko Sakkinen Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/20210317235332.362001-1-jarkko.sakkinen@intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/main.c | 54 ++++++++++++++++------------------ arch/x86/kernel/cpu/sgx/sgx.h | 7 ----- 2 files changed, 25 insertions(+), 36 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 010fea31942e..415244a960cf 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -26,39 +26,43 @@ static LIST_HEAD(sgx_active_page_list); static DEFINE_SPINLOCK(sgx_reclaimer_lock); +static LIST_HEAD(sgx_dirty_page_list); + /* - * Reset dirty EPC pages to uninitialized state. Laundry can be left with SECS - * pages whose child pages blocked EREMOVE. + * Reset post-kexec EPC pages to the uninitialized state. The pages are removed + * from the input list, and made available for the page allocator. SECS pages + * prepending their children in the input list are left intact. */ -static void sgx_sanitize_section(struct sgx_epc_section *section) +static void __sgx_sanitize_pages(struct list_head *dirty_page_list) { struct sgx_epc_page *page; LIST_HEAD(dirty); int ret; - /* init_laundry_list is thread-local, no need for a lock: */ - while (!list_empty(§ion->init_laundry_list)) { + /* dirty_page_list is thread-local, no need for a lock: */ + while (!list_empty(dirty_page_list)) { if (kthread_should_stop()) return; - /* needed for access to ->page_list: */ - spin_lock(§ion->lock); - - page = list_first_entry(§ion->init_laundry_list, - struct sgx_epc_page, list); + page = list_first_entry(dirty_page_list, struct sgx_epc_page, list); ret = __eremove(sgx_get_epc_virt_addr(page)); - if (!ret) - list_move(&page->list, §ion->page_list); - else + if (!ret) { + /* + * page is now sanitized. Make it available via the SGX + * page allocator: + */ + list_del(&page->list); + sgx_free_epc_page(page); + } else { + /* The page is not yet clean - move to the dirty list. */ list_move_tail(&page->list, &dirty); - - spin_unlock(§ion->lock); + } cond_resched(); } - list_splice(&dirty, §ion->init_laundry_list); + list_splice(&dirty, dirty_page_list); } static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page) @@ -405,24 +409,17 @@ static bool sgx_should_reclaim(unsigned long watermark) static int ksgxd(void *p) { - int i; - set_freezable(); /* * Sanitize pages in order to recover from kexec(). The 2nd pass is * required for SECS pages, whose child pages blocked EREMOVE. */ - for (i = 0; i < sgx_nr_epc_sections; i++) - sgx_sanitize_section(&sgx_epc_sections[i]); - - for (i = 0; i < sgx_nr_epc_sections; i++) { - sgx_sanitize_section(&sgx_epc_sections[i]); + __sgx_sanitize_pages(&sgx_dirty_page_list); + __sgx_sanitize_pages(&sgx_dirty_page_list); - /* Should never happen. 
*/ - if (!list_empty(&sgx_epc_sections[i].init_laundry_list)) - WARN(1, "EPC section %d has unsanitized pages.\n", i); - } + /* sanity check: */ + WARN_ON(!list_empty(&sgx_dirty_page_list)); while (!kthread_should_stop()) { if (try_to_freeze()) @@ -637,13 +634,12 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, section->phys_addr = phys_addr; spin_lock_init(§ion->lock); INIT_LIST_HEAD(§ion->page_list); - INIT_LIST_HEAD(§ion->init_laundry_list); for (i = 0; i < nr_pages; i++) { section->pages[i].section = index; section->pages[i].flags = 0; section->pages[i].owner = NULL; - list_add_tail(§ion->pages[i].list, §ion->init_laundry_list); + list_add_tail(§ion->pages[i].list, &sgx_dirty_page_list); } section->free_cnt = nr_pages; diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 5fa42d143feb..bc8af0428640 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -45,13 +45,6 @@ struct sgx_epc_section { spinlock_t lock; struct list_head page_list; unsigned long free_cnt; - - /* - * Pages which need EREMOVE run on them before they can be - * used. Only safe to be accessed in ksgxd and init code. - * Not protected by locks. - */ - struct list_head init_laundry_list; }; extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; -- Gitee From 174b9d89c2606874f5d7a8cfa3f5e0e818347a4d Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Thu, 18 Mar 2021 01:53:31 +0200 Subject: [PATCH 018/134] x86/sgx: Add a basic NUMA allocation scheme to sgx_alloc_epc_page() commit 901ddbb9ecf5425183ea0c09d10c2fd7868dce54 upstream. Background ========== SGX enclave memory is enumerated by the processor in contiguous physical ranges called Enclave Page Cache (EPC) sections. Currently, there is a free list per section, but allocations simply target the lowest-numbered sections. This is functional, but has no NUMA awareness. Fortunately, EPC sections are covered by entries in the ACPI SRAT table. These entries allow each EPC section to be associated with a NUMA node, just like normal RAM. Solution ======== Implement a NUMA-aware enclave page allocator. Mirror the buddy allocator and maintain a list of enclave pages for each NUMA node. Attempt to allocate enclave memory first from local nodes, then fall back to other nodes. Note that the fallback is not as sophisticated as the buddy allocator and is itself not aware of NUMA distances. When a node's free list is empty, it searches for the next-highest node with enclave pages (and will wrap if necessary). This could be improved in the future. Other ===== NUMA_KEEP_MEMINFO dependency is required for phys_to_target_node(). [ Kai Huang: Do not return NULL from __sgx_alloc_epc_page() because callers do not expect that and that leads to a NULL ptr deref. ] [ dhansen: Fix an uninitialized 'nid' variable in __sgx_alloc_epc_page() as Reported-by: kernel test robot to avoid any potential allocations from the wrong NUMA node or even premature allocation failures. ] Intel-SIG: commit 901ddbb9ecf5 x86/sgx: Add a basic NUMA allocation scheme to sgx_alloc_epc_page() Backport for SGX virtualization support on Intel Xeon platform. 
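Condensed from the __sgx_alloc_epc_page() hunk below, the fallback search amounts to:

	struct sgx_epc_page *page = NULL;
	int nid_of_current = numa_node_id();
	int nid = nid_of_current;

	/* Prefer the node the caller is currently executing on... */
	if (node_isset(nid_of_current, sgx_numa_mask))
		page = __sgx_alloc_epc_page_from_node(nid_of_current);

	/* ...then walk the other EPC-bearing nodes, wrapping around. */
	while (!page) {
		nid = next_node_in(nid, sgx_numa_mask);
		if (nid == nid_of_current)
			break;	/* wrapped: out of EPC pages */
		page = __sgx_alloc_epc_page_from_node(nid);
	}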
Signed-off-by: Jarkko Sakkinen Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Link: https://lore.kernel.org/lkml/158188326978.894464.217282995221175417.stgit@dwillia2-desk3.amr.corp.intel.com/ Link: https://lkml.kernel.org/r/20210319040602.178558-1-kai.huang@intel.com Link: https://lkml.kernel.org/r/20210318214933.29341-1-dave.hansen@intel.com Link: https://lkml.kernel.org/r/20210317235332.362001-2-jarkko.sakkinen@intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/Kconfig | 1 + arch/x86/kernel/cpu/sgx/main.c | 119 +++++++++++++++++++++------------ arch/x86/kernel/cpu/sgx/sgx.h | 16 +++-- 3 files changed, 88 insertions(+), 48 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cb1cc0ecbd1a..9c069f5da31c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2016,6 +2016,7 @@ config X86_SGX depends on CRYPTO_SHA256=y select SRCU select MMU_NOTIFIER + select NUMA_KEEP_MEMINFO if NUMA help Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions that can be used by applications to set aside private regions of code diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 415244a960cf..3e03d533231c 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -23,9 +23,21 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); * with sgx_reclaimer_lock acquired. */ static LIST_HEAD(sgx_active_page_list); - static DEFINE_SPINLOCK(sgx_reclaimer_lock); +/* The free page list lock protected variables prepend the lock. */ +static unsigned long sgx_nr_free_pages; + +/* Nodes with one or more EPC sections. */ +static nodemask_t sgx_numa_mask; + +/* + * Array with one list_head for each possible NUMA node. Each + * list contains all the sgx_epc_section's which are on that + * node. 
+ */ +static struct sgx_numa_node *sgx_numa_nodes; + static LIST_HEAD(sgx_dirty_page_list); /* @@ -312,6 +324,7 @@ static void sgx_reclaim_pages(void) struct sgx_epc_section *section; struct sgx_encl_page *encl_page; struct sgx_epc_page *epc_page; + struct sgx_numa_node *node; pgoff_t page_index; int cnt = 0; int ret; @@ -383,28 +396,18 @@ static void sgx_reclaim_pages(void) epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; section = &sgx_epc_sections[epc_page->section]; - spin_lock(§ion->lock); - list_add_tail(&epc_page->list, §ion->page_list); - section->free_cnt++; - spin_unlock(§ion->lock); - } -} - -static unsigned long sgx_nr_free_pages(void) -{ - unsigned long cnt = 0; - int i; - - for (i = 0; i < sgx_nr_epc_sections; i++) - cnt += sgx_epc_sections[i].free_cnt; + node = section->node; - return cnt; + spin_lock(&node->lock); + list_add_tail(&epc_page->list, &node->free_page_list); + sgx_nr_free_pages++; + spin_unlock(&node->lock); + } } static bool sgx_should_reclaim(unsigned long watermark) { - return sgx_nr_free_pages() < watermark && - !list_empty(&sgx_active_page_list); + return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list); } static int ksgxd(void *p) @@ -451,45 +454,56 @@ static bool __init sgx_page_reclaimer_init(void) return true; } -static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section) +static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) { - struct sgx_epc_page *page; + struct sgx_numa_node *node = &sgx_numa_nodes[nid]; + struct sgx_epc_page *page = NULL; - spin_lock(§ion->lock); + spin_lock(&node->lock); - if (list_empty(§ion->page_list)) { - spin_unlock(§ion->lock); + if (list_empty(&node->free_page_list)) { + spin_unlock(&node->lock); return NULL; } - page = list_first_entry(§ion->page_list, struct sgx_epc_page, list); + page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list); list_del_init(&page->list); - section->free_cnt--; + sgx_nr_free_pages--; + + spin_unlock(&node->lock); - spin_unlock(§ion->lock); return page; } /** * __sgx_alloc_epc_page() - Allocate an EPC page * - * Iterate through EPC sections and borrow a free EPC page to the caller. When a - * page is no longer needed it must be released with sgx_free_epc_page(). + * Iterate through NUMA nodes and reserve ia free EPC page to the caller. Start + * from the NUMA node, where the caller is executing. * * Return: - * an EPC page, - * -errno on error + * - an EPC page: A borrowed EPC pages were available. + * - NULL: Out of EPC pages. 
*/ struct sgx_epc_page *__sgx_alloc_epc_page(void) { - struct sgx_epc_section *section; struct sgx_epc_page *page; - int i; + int nid_of_current = numa_node_id(); + int nid = nid_of_current; - for (i = 0; i < sgx_nr_epc_sections; i++) { - section = &sgx_epc_sections[i]; + if (node_isset(nid_of_current, sgx_numa_mask)) { + page = __sgx_alloc_epc_page_from_node(nid_of_current); + if (page) + return page; + } + + /* Fall back to the non-local NUMA nodes: */ + while (true) { + nid = next_node_in(nid, sgx_numa_mask); + if (nid == nid_of_current) + break; - page = __sgx_alloc_epc_page_from_section(section); + page = __sgx_alloc_epc_page_from_node(nid); if (page) return page; } @@ -600,6 +614,7 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim) void sgx_free_epc_page(struct sgx_epc_page *page) { struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + struct sgx_numa_node *node = section->node; int ret; WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); @@ -608,10 +623,12 @@ void sgx_free_epc_page(struct sgx_epc_page *page) if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret)) return; - spin_lock(§ion->lock); - list_add_tail(&page->list, §ion->page_list); - section->free_cnt++; - spin_unlock(§ion->lock); + spin_lock(&node->lock); + + list_add_tail(&page->list, &node->free_page_list); + sgx_nr_free_pages++; + + spin_unlock(&node->lock); } static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, @@ -632,8 +649,6 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, } section->phys_addr = phys_addr; - spin_lock_init(§ion->lock); - INIT_LIST_HEAD(§ion->page_list); for (i = 0; i < nr_pages; i++) { section->pages[i].section = index; @@ -642,7 +657,7 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, list_add_tail(§ion->pages[i].list, &sgx_dirty_page_list); } - section->free_cnt = nr_pages; + sgx_nr_free_pages += nr_pages; return true; } @@ -661,8 +676,13 @@ static bool __init sgx_page_cache_init(void) { u32 eax, ebx, ecx, edx, type; u64 pa, size; + int nid; int i; + sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL); + if (!sgx_numa_nodes) + return false; + for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) { cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx); @@ -685,6 +705,21 @@ static bool __init sgx_page_cache_init(void) break; } + nid = numa_map_to_online_node(phys_to_target_node(pa)); + if (nid == NUMA_NO_NODE) { + /* The physical address is already printed above. */ + pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n"); + nid = 0; + } + + if (!node_isset(nid, sgx_numa_mask)) { + spin_lock_init(&sgx_numa_nodes[nid].lock); + INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list); + node_set(nid, sgx_numa_mask); + } + + sgx_epc_sections[i].node = &sgx_numa_nodes[nid]; + sgx_nr_epc_sections++; } diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index bc8af0428640..653af8ca1a25 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -29,22 +29,26 @@ struct sgx_epc_page { struct list_head list; }; +/* + * Contains the tracking data for NUMA nodes having EPC pages. Most importantly, + * the free page list local to the node is stored here. + */ +struct sgx_numa_node { + struct list_head free_page_list; + spinlock_t lock; +}; + /* * The firmware can define multiple chunks of EPC to the different areas of the * physical memory e.g. for memory areas of the each node. 
This structure is * used to store EPC pages for one EPC section and virtual memory area where * the pages have been mapped. - * - * 'lock' must be held before accessing 'page_list' or 'free_cnt'. */ struct sgx_epc_section { unsigned long phys_addr; void *virt_addr; struct sgx_epc_page *pages; - - spinlock_t lock; - struct list_head page_list; - unsigned long free_cnt; + struct sgx_numa_node *node; }; extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; -- Gitee From f564d328ffa3099a1106a34accc56f6532a55744 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 24 Mar 2021 11:22:46 -0700 Subject: [PATCH 019/134] x86/sgx: Remove unnecessary kmap() from sgx_ioc_enclave_init() commit 633b0616cfe085679471a4c0fae02e8c3a1a9866 upstream. kmap() is inefficient and is being replaced by kmap_local_page(), if possible. There is no readily apparent reason why initp_page needs to be allocated and kmap'ed() except that 'sigstruct' needs to be page-aligned and 'token' 512 byte-aligned. Rather than change it to kmap_local_page(), use kmalloc() instead because kmalloc() can give this alignment when allocating PAGE_SIZE bytes. Remove the alloc_page()/kmap() and replace with kmalloc(PAGE_SIZE, ...) to get a page aligned kernel address. In addition, add a comment to document the alignment requirements so that others don't attempt to 'fix' this again. [ bp: Massage commit message. ] Intel-SIG: commit 633b0616cfe0 x86/sgx: Remove unnecessary kmap() from sgx_ioc_enclave_init() Backport for SGX virtualization support on Intel Xeon platform. Signed-off-by: Ira Weiny Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210324182246.2484875-1-ira.weiny@intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/ioctl.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 9228d8c7660f..8f7b427da4f6 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -604,7 +604,6 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) { struct sgx_sigstruct *sigstruct; struct sgx_enclave_init init_arg; - struct page *initp_page; void *token; int ret; @@ -615,11 +614,15 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) if (copy_from_user(&init_arg, arg, sizeof(init_arg))) return -EFAULT; - initp_page = alloc_page(GFP_KERNEL); - if (!initp_page) + /* + * 'sigstruct' must be on a page boundary and 'token' on a 512 byte + * boundary. kmalloc() will give this alignment when allocating + * PAGE_SIZE bytes. + */ + sigstruct = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!sigstruct) return -ENOMEM; - sigstruct = kmap(initp_page); token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2); memset(token, 0, SGX_LAUNCH_TOKEN_SIZE); @@ -645,8 +648,7 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) ret = sgx_encl_init(encl, sigstruct, token); out: - kunmap(initp_page); - __free_page(initp_page); + kfree(sigstruct); return ret; } -- Gitee From e4600745f785b23a555a3c902edc005e4c20342a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 18 Mar 2021 12:43:01 -0700 Subject: [PATCH 020/134] selftests/sgx: Improve error detection and messages commit 4284f7acb78bfb0e0c26a2b78e2b2c3d68fccd6f upstream. The SGX device file (/dev/sgx_enclave) is unusual in that it requires execute permissions. It has to be both "chmod +x" *and* be on a filesystem without 'noexec'. 
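The sanity checks added to load.c below boil down to two probes (error reporting trimmed):

	/* Execute bits must be set on the device node itself... */
	ret = stat(device_path, &sb);
	if (!ret && !(sb.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH)))
		fprintf(stderr, "no execute permissions on device file %s\n",
			device_path);

	/* ...and a PROT_EXEC mmap() probe catches a noexec /dev mount. */
	ptr = mmap(NULL, PAGE_SIZE, PROT_EXEC, MAP_SHARED, fd, 0);
	if (ptr == MAP_FAILED)
		perror("mmap for exec");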
In the future, udev and systemd should get updates to set up systems automatically. But, for now, nobody's systems do this automatically, and everybody gets error messages like this when running ./test_sgx:

	0x0000000000000000 0x0000000000002000 0x03
	0x0000000000002000 0x0000000000001000 0x05
	0x0000000000003000 0x0000000000003000 0x03
	mmap() failed, errno=1.

That isn't very user friendly, even for forgetful kernel developers.

Further, the test case is rather haphazard about its use of fprintf() versus perror().

Improve the error messages. Use perror() where possible. Lastly, do some sanity checks on opening and mmap()ing the device file so that we can get a decent error message out to the user.

Now, if your user doesn't have permission, you'll get the following:

	$ ls -l /dev/sgx_enclave
	crw------- 1 root root 10, 126 Mar 18 11:29 /dev/sgx_enclave
	$ ./test_sgx
	Unable to open /dev/sgx_enclave: Permission denied

If you then 'chown dave:dave /dev/sgx_enclave' (or whatever), but you leave execute permissions off, you'll get:

	$ ls -l /dev/sgx_enclave
	crw------- 1 dave dave 10, 126 Mar 18 11:29 /dev/sgx_enclave
	$ ./test_sgx
	no execute permissions on device file

If you fix that with "chmod ug+x /dev/sgx" but you leave /dev as noexec, you'll get this:

	$ mount | grep "/dev .*noexec"
	udev on /dev type devtmpfs (rw,nosuid,noexec,...)
	$ ./test_sgx
	ERROR: mmap for exec: Operation not permitted
	mmap() succeeded for PROT_READ, but failed for PROT_EXEC
	check that user has execute permissions on /dev/sgx_enclave and
	that /dev does not have noexec set: 'mount | grep "/dev .*noexec"'

That can be fixed with:

	mount -o remount,exec /dev

Hopefully, the combination of better error messages and the search engines indexing this message will help people fix their systems until we do this properly.

[ bp: Improve error messages more. ]

Intel-SIG: commit 4284f7acb78b selftests/sgx: Improve error detection and messages Backport for SGX virtualization support on Intel Xeon platform.
Signed-off-by: Dave Hansen Signed-off-by: Ingo Molnar Signed-off-by: Borislav Petkov Reviewed-by: Jarkko Sakkinen Link: https://lore.kernel.org/r/20210318194301.11D9A984@viggo.jf.intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/load.c | 69 ++++++++++++++++++++++++------ tools/testing/selftests/sgx/main.c | 2 +- 2 files changed, 56 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/sgx/load.c b/tools/testing/selftests/sgx/load.c index 9d43b75aaa55..f441ac34b4d4 100644 --- a/tools/testing/selftests/sgx/load.c +++ b/tools/testing/selftests/sgx/load.c @@ -45,19 +45,19 @@ static bool encl_map_bin(const char *path, struct encl *encl) fd = open(path, O_RDONLY); if (fd == -1) { - perror("open()"); + perror("enclave executable open()"); return false; } ret = stat(path, &sb); if (ret) { - perror("stat()"); + perror("enclave executable stat()"); goto err; } bin = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); if (bin == MAP_FAILED) { - perror("mmap()"); + perror("enclave executable mmap()"); goto err; } @@ -90,8 +90,7 @@ static bool encl_ioc_create(struct encl *encl) ioc.src = (unsigned long)secs; rc = ioctl(encl->fd, SGX_IOC_ENCLAVE_CREATE, &ioc); if (rc) { - fprintf(stderr, "SGX_IOC_ENCLAVE_CREATE failed: errno=%d\n", - errno); + perror("SGX_IOC_ENCLAVE_CREATE failed"); munmap((void *)secs->base, encl->encl_size); return false; } @@ -116,31 +115,72 @@ static bool encl_ioc_add_pages(struct encl *encl, struct encl_segment *seg) rc = ioctl(encl->fd, SGX_IOC_ENCLAVE_ADD_PAGES, &ioc); if (rc < 0) { - fprintf(stderr, "SGX_IOC_ENCLAVE_ADD_PAGES failed: errno=%d.\n", - errno); + perror("SGX_IOC_ENCLAVE_ADD_PAGES failed"); return false; } return true; } + + bool encl_load(const char *path, struct encl *encl) { + const char device_path[] = "/dev/sgx_enclave"; Elf64_Phdr *phdr_tbl; off_t src_offset; Elf64_Ehdr *ehdr; + struct stat sb; + void *ptr; int i, j; int ret; + int fd = -1; memset(encl, 0, sizeof(*encl)); - ret = open("/dev/sgx_enclave", O_RDWR); - if (ret < 0) { - fprintf(stderr, "Unable to open /dev/sgx_enclave\n"); + fd = open(device_path, O_RDWR); + if (fd < 0) { + perror("Unable to open /dev/sgx_enclave"); + goto err; + } + + ret = stat(device_path, &sb); + if (ret) { + perror("device file stat()"); + goto err; + } + + /* + * This just checks if the /dev file has these permission + * bits set. It does not check that the current user is + * the owner or in the owning group. 
+ */ + if (!(sb.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) { + fprintf(stderr, "no execute permissions on device file %s\n", device_path); + goto err; + } + + ptr = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED, fd, 0); + if (ptr == (void *)-1) { + perror("mmap for read"); + goto err; + } + munmap(ptr, PAGE_SIZE); + +#define ERR_MSG \ +"mmap() succeeded for PROT_READ, but failed for PROT_EXEC.\n" \ +" Check that current user has execute permissions on %s and \n" \ +" that /dev does not have noexec set: mount | grep \"/dev .*noexec\"\n" \ +" If so, remount it executable: mount -o remount,exec /dev\n\n" + + ptr = mmap(NULL, PAGE_SIZE, PROT_EXEC, MAP_SHARED, fd, 0); + if (ptr == (void *)-1) { + fprintf(stderr, ERR_MSG, device_path); goto err; } + munmap(ptr, PAGE_SIZE); - encl->fd = ret; + encl->fd = fd; if (!encl_map_bin(path, encl)) goto err; @@ -217,6 +257,8 @@ bool encl_load(const char *path, struct encl *encl) return true; err: + if (fd != -1) + close(fd); encl_delete(encl); return false; } @@ -229,7 +271,7 @@ static bool encl_map_area(struct encl *encl) area = mmap(NULL, encl_size * 2, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (area == MAP_FAILED) { - perror("mmap"); + perror("reservation mmap()"); return false; } @@ -268,8 +310,7 @@ bool encl_build(struct encl *encl) ioc.sigstruct = (uint64_t)&encl->sigstruct; ret = ioctl(encl->fd, SGX_IOC_ENCLAVE_INIT, &ioc); if (ret) { - fprintf(stderr, "SGX_IOC_ENCLAVE_INIT failed: errno=%d\n", - errno); + perror("SGX_IOC_ENCLAVE_INIT failed"); return false; } diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 724cec700926..b117bb86a73f 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -195,7 +195,7 @@ int main(int argc, char *argv[], char *envp[]) addr = mmap((void *)encl.encl_base + seg->offset, seg->size, seg->prot, MAP_SHARED | MAP_FIXED, encl.fd, 0); if (addr == MAP_FAILED) { - fprintf(stderr, "mmap() failed, errno=%d.\n", errno); + perror("mmap() segment failed"); exit(KSFT_FAIL); } } -- Gitee From abf650d13bc30a46460060dc522738c12975351c Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Sun, 14 Mar 2021 19:16:21 +0800 Subject: [PATCH 021/134] selftests/sgx: Use getauxval() to simplify test code commit f33dece70e11ce82a09cb1ea2d7c32347b82c67e upstream. Use the library function getauxval() instead of a custom function to get the base address of the vDSO. [ bp: Massage commit message. ] Intel-SIG: commit f33dece70e11 selftests/sgx: Use getauxval() to simplify test code Backport for SGX virtualization support on Intel Xeon platform. 
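The custom envp[]-walking helper reduces to a single libc call:

	#include <sys/auxv.h>

	/* Get vDSO base address */
	addr = (void *)getauxval(AT_SYSINFO_EHDR);

getauxval() returns 0 when the requested entry is absent, so the existing NULL check on the returned address still applies.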
Signed-off-by: Tianjia Zhang Signed-off-by: Borislav Petkov Reviewed-by: Jarkko Sakkinen Acked-by: Shuah Khan Link: https://lkml.kernel.org/r/20210314111621.68428-1-tianjia.zhang@linux.alibaba.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/main.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index b117bb86a73f..d304a4044eb9 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "defines.h" #include "main.h" #include "../kselftest.h" @@ -28,24 +29,6 @@ struct vdso_symtab { Elf64_Word *elf_hashtab; }; -static void *vdso_get_base_addr(char *envp[]) -{ - Elf64_auxv_t *auxv; - int i; - - for (i = 0; envp[i]; i++) - ; - - auxv = (Elf64_auxv_t *)&envp[i + 1]; - - for (i = 0; auxv[i].a_type != AT_NULL; i++) { - if (auxv[i].a_type == AT_SYSINFO_EHDR) - return (void *)auxv[i].a_un.a_val; - } - - return NULL; -} - static Elf64_Dyn *vdso_get_dyntab(void *addr) { Elf64_Ehdr *ehdr = addr; @@ -162,7 +145,7 @@ static int user_handler(long rdi, long rsi, long rdx, long ursp, long r8, long r return 0; } -int main(int argc, char *argv[], char *envp[]) +int main(int argc, char *argv[]) { struct sgx_enclave_run run; struct vdso_symtab symtab; @@ -203,7 +186,8 @@ int main(int argc, char *argv[], char *envp[]) memset(&run, 0, sizeof(run)); run.tcs = encl.encl_base; - addr = vdso_get_base_addr(envp); + /* Get vDSO base address */ + addr = (void *)getauxval(AT_SYSINFO_EHDR); if (!addr) goto err; -- Gitee From d89d82851c8d5f4a7f49338620f96dab4a2a8193 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Mar 2021 20:23:03 +1300 Subject: [PATCH 022/134] x86/sgx: Expose SGX architectural definitions to the kernel commit 8ca52cc38dc8fdcbdbd0c23eafb19db5e5f5c8d0 upstream. Expose SGX architectural structures, as KVM will use many of the architectural constants and structs to virtualize SGX. Name the new header file as asm/sgx.h, rather than asm/sgx_arch.h, to have single header to provide SGX facilities to share with other kernel componments. Also update MAINTAINERS to include asm/sgx.h. Intel-SIG: commit 8ca52cc38dc8 x86/sgx: Expose SGX architectural definitions to the kernel Backport for SGX virtualization support on Intel Xeon platform. Signed-off-by: Sean Christopherson Co-developed-by: Kai Huang Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Jarkko Sakkinen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/6bf47acd91ab4d709e66ad1692c7803e4c9063a0.1616136308.git.kai.huang@intel.com [ Zhiquan Li: amend commit log and resolve the conflict. 
] Signed-off-by: Zhiquan Li --- MAINTAINERS | 1 + .../cpu/sgx/arch.h => include/asm/sgx.h} | 20 ++++++++++++++----- arch/x86/kernel/cpu/sgx/encl.c | 2 +- arch/x86/kernel/cpu/sgx/sgx.h | 2 +- tools/testing/selftests/sgx/defines.h | 2 +- 5 files changed, 19 insertions(+), 8 deletions(-) rename arch/x86/{kernel/cpu/sgx/arch.h => include/asm/sgx.h} (95%) diff --git a/MAINTAINERS b/MAINTAINERS index e0a5132a9487..ae4bc7436317 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8548,6 +8548,7 @@ Q: https://patchwork.kernel.org/project/intel-sgx/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-sgx.git F: Documentation/x86/sgx.rst F: arch/x86/entry/vdso/vsgx.S +F: arch/x86/include/asm/sgx.h F: arch/x86/include/uapi/asm/sgx.h F: arch/x86/kernel/cpu/sgx/* F: tools/testing/selftests/sgx/* diff --git a/arch/x86/kernel/cpu/sgx/arch.h b/arch/x86/include/asm/sgx.h similarity index 95% rename from arch/x86/kernel/cpu/sgx/arch.h rename to arch/x86/include/asm/sgx.h index dd7602c44c72..d96e5451d28a 100644 --- a/arch/x86/kernel/cpu/sgx/arch.h +++ b/arch/x86/include/asm/sgx.h @@ -2,15 +2,20 @@ /** * Copyright(c) 2016-20 Intel Corporation. * - * Contains data structures defined by the SGX architecture. Data structures - * defined by the Linux software stack should not be placed here. + * Intel Software Guard Extensions (SGX) support. */ -#ifndef _ASM_X86_SGX_ARCH_H -#define _ASM_X86_SGX_ARCH_H +#ifndef _ASM_X86_SGX_H +#define _ASM_X86_SGX_H #include #include +/* + * This file contains both data structures defined by SGX architecture and Linux + * defined software data structures and functions. The two should not be mixed + * together for better readibility. The architectural definitions come first. + */ + /* The SGX specific CPUID function. */ #define SGX_CPUID 0x12 /* EPC enumeration. */ @@ -335,4 +340,9 @@ struct sgx_sigstruct { #define SGX_LAUNCH_TOKEN_SIZE 304 -#endif /* _ASM_X86_SGX_ARCH_H */ +/* + * Do not put any hardware-defined SGX structure representations below this + * comment! + */ + +#endif /* _ASM_X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 332977248ae0..759ec02f59e8 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -7,7 +7,7 @@ #include #include #include -#include "arch.h" +#include #include "encl.h" #include "encls.h" #include "sgx.h" diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 653af8ca1a25..2a2b5c857451 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -8,7 +8,7 @@ #include #include #include -#include "arch.h" +#include #undef pr_fmt #define pr_fmt(fmt) "sgx: " fmt diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h index 592c1ccf4576..0bd73428d2f3 100644 --- a/tools/testing/selftests/sgx/defines.h +++ b/tools/testing/selftests/sgx/defines.h @@ -14,7 +14,7 @@ #define __aligned(x) __attribute__((__aligned__(x))) #define __packed __attribute__((packed)) -#include "../../../../arch/x86/kernel/cpu/sgx/arch.h" +#include "../../../../arch/x86/include/asm/sgx.h" #include "../../../../arch/x86/include/asm/enclu.h" #include "../../../../arch/x86/include/uapi/asm/sgx.h" -- Gitee From f3a2757a1825faf5168ca2a1edb16dc8b0905a0c Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Thu, 10 Jun 2021 11:30:17 +0300 Subject: [PATCH 023/134] selftests/sgx: Rename 'eenter' and 'sgx_call_vdso' commit 6a7171b8a0f8e961744d0c46fb7547662a3fca36 upstream. 
Rename symbols for better clarity: * 'eenter' might be confused for directly calling ENCLU[EENTER]. It does not. It calls into the VDSO, which actually has the EENTER instruction. * 'sgx_call_vdso' is *only* used for entering the enclave. It's not some generic SGX call into the VDSO. Intel-SIG: commit 6a7171b8a0f8 selftests/sgx: Rename 'eenter' and 'sgx_call_vdso' Backport for SGX virtualization support on Intel Xeon platform. Signed-off-by: Jarkko Sakkinen Signed-off-by: Shuah Khan [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/call.S | 6 +++--- tools/testing/selftests/sgx/main.c | 25 +++++++++++++------------ tools/testing/selftests/sgx/main.h | 4 ++-- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/sgx/call.S b/tools/testing/selftests/sgx/call.S index 4ecadc7490f4..b09a25890f3b 100644 --- a/tools/testing/selftests/sgx/call.S +++ b/tools/testing/selftests/sgx/call.S @@ -5,8 +5,8 @@ .text - .global sgx_call_vdso -sgx_call_vdso: + .global sgx_enter_enclave +sgx_enter_enclave: .cfi_startproc push %r15 .cfi_adjust_cfa_offset 8 @@ -27,7 +27,7 @@ sgx_call_vdso: .cfi_adjust_cfa_offset 8 push 0x38(%rsp) .cfi_adjust_cfa_offset 8 - call *eenter(%rip) + call *vdso_sgx_enter_enclave(%rip) add $0x10, %rsp .cfi_adjust_cfa_offset -0x10 pop %rbx diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index d304a4044eb9..43da68388e25 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -21,7 +21,7 @@ #include "../kselftest.h" static const uint64_t MAGIC = 0x1122334455667788ULL; -vdso_sgx_enter_enclave_t eenter; +vdso_sgx_enter_enclave_t vdso_sgx_enter_enclave; struct vdso_symtab { Elf64_Sym *elf_symtab; @@ -149,7 +149,7 @@ int main(int argc, char *argv[]) { struct sgx_enclave_run run; struct vdso_symtab symtab; - Elf64_Sym *eenter_sym; + Elf64_Sym *sgx_enter_enclave_sym; uint64_t result = 0; struct encl encl; unsigned int i; @@ -194,29 +194,30 @@ int main(int argc, char *argv[]) if (!vdso_get_symtab(addr, &symtab)) goto err; - eenter_sym = vdso_symtab_get(&symtab, "__vdso_sgx_enter_enclave"); - if (!eenter_sym) + sgx_enter_enclave_sym = vdso_symtab_get(&symtab, "__vdso_sgx_enter_enclave"); + if (!sgx_enter_enclave_sym) goto err; - eenter = addr + eenter_sym->st_value; + vdso_sgx_enter_enclave = addr + sgx_enter_enclave_sym->st_value; - ret = sgx_call_vdso((void *)&MAGIC, &result, 0, EENTER, NULL, NULL, &run); - if (!report_results(&run, ret, result, "sgx_call_vdso")) + ret = sgx_enter_enclave((void *)&MAGIC, &result, 0, EENTER, + NULL, NULL, &run); + if (!report_results(&run, ret, result, "sgx_enter_enclave_unclobbered")) goto err; /* Invoke the vDSO directly. */ result = 0; - ret = eenter((unsigned long)&MAGIC, (unsigned long)&result, 0, EENTER, - 0, 0, &run); - if (!report_results(&run, ret, result, "eenter")) + ret = vdso_sgx_enter_enclave((unsigned long)&MAGIC, (unsigned long)&result, + 0, EENTER, 0, 0, &run); + if (!report_results(&run, ret, result, "sgx_enter_enclave")) goto err; /* And with an exit handler. 
*/ run.user_handler = (__u64)user_handler; run.user_data = 0xdeadbeef; - ret = eenter((unsigned long)&MAGIC, (unsigned long)&result, 0, EENTER, - 0, 0, &run); + ret = vdso_sgx_enter_enclave((unsigned long)&MAGIC, (unsigned long)&result, + 0, EENTER, 0, 0, &run); if (!report_results(&run, ret, result, "user_handler")) goto err; diff --git a/tools/testing/selftests/sgx/main.h b/tools/testing/selftests/sgx/main.h index 67211a708f04..68672fd86cf9 100644 --- a/tools/testing/selftests/sgx/main.h +++ b/tools/testing/selftests/sgx/main.h @@ -35,7 +35,7 @@ bool encl_load(const char *path, struct encl *encl); bool encl_measure(struct encl *encl); bool encl_build(struct encl *encl); -int sgx_call_vdso(void *rdi, void *rsi, long rdx, u32 function, void *r8, void *r9, - struct sgx_enclave_run *run); +int sgx_enter_enclave(void *rdi, void *rsi, long rdx, u32 function, void *r8, void *r9, + struct sgx_enclave_run *run); #endif /* MAIN_H */ -- Gitee From 022fafcbf50037a4d1ddebf0b2137bacbb00d599 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Thu, 10 Jun 2021 11:30:18 +0300 Subject: [PATCH 024/134] selftests/sgx: Migrate to kselftest harness commit 235d1c9c63088c33d746a1e7e92e15153b8d1192 upstream. Migrate to kselftest harness. Use a fixture test with enclave initialized and de-initialized for each of the existing three tests, in other words: 1. One FIXTURE() for managing the enclave life-cycle. 2. Three TEST_F()'s, one for each test case. Dump lines of /proc/self/maps matching "sgx" in FIXTURE_SETUP() as this can be very useful debugging information later on. Amended commit log: This migration changes the output of this test. Instead of skipping the tests if open /dev/sgx_enclave fails, it will run all the tests and report failures on all of them. Shuah Khan Intel-SIG: commit 235d1c9c6308 selftests/sgx: Migrate to kselftest harness Backport for SGX virtualization support on Intel Xeon platform. 
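The resulting test structure, condensed from the diff below:

	FIXTURE(enclave) {
		struct encl encl;
		struct sgx_enclave_run run;
	};

	FIXTURE_SETUP(enclave)
	{
		/* load, measure, build and mmap() the enclave */
	}

	FIXTURE_TEARDOWN(enclave)
	{
		encl_delete(&self->encl);
	}

	TEST_F(enclave, unclobbered_vdso)
	{
		/* one TEST_F() per former test case */
	}

	TEST_HARNESS_MAIN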
Signed-off-by: Jarkko Sakkinen Signed-off-by: Shuah Khan [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/load.c | 3 - tools/testing/selftests/sgx/main.c | 177 +++++++++++++++-------------- 2 files changed, 92 insertions(+), 88 deletions(-) diff --git a/tools/testing/selftests/sgx/load.c b/tools/testing/selftests/sgx/load.c index f441ac34b4d4..00928be57fc4 100644 --- a/tools/testing/selftests/sgx/load.c +++ b/tools/testing/selftests/sgx/load.c @@ -239,9 +239,6 @@ bool encl_load(const char *path, struct encl *encl) seg->offset = (phdr->p_offset & PAGE_MASK) - src_offset; seg->size = (phdr->p_filesz + PAGE_SIZE - 1) & PAGE_MASK; - printf("0x%016lx 0x%016lx 0x%02x\n", seg->offset, seg->size, - seg->prot); - j++; } diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 43da68388e25..6da19b6bf287 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -17,8 +17,8 @@ #include #include #include "defines.h" +#include "../kselftest_harness.h" #include "main.h" -#include "../kselftest.h" static const uint64_t MAGIC = 0x1122334455667788ULL; vdso_sgx_enter_enclave_t vdso_sgx_enter_enclave; @@ -107,85 +107,49 @@ static Elf64_Sym *vdso_symtab_get(struct vdso_symtab *symtab, const char *name) return NULL; } -bool report_results(struct sgx_enclave_run *run, int ret, uint64_t result, - const char *test) -{ - bool valid = true; - - if (ret) { - printf("FAIL: %s() returned: %d\n", test, ret); - valid = false; - } - - if (run->function != EEXIT) { - printf("FAIL: %s() function, expected: %u, got: %u\n", test, EEXIT, - run->function); - valid = false; - } - - if (result != MAGIC) { - printf("FAIL: %s(), expected: 0x%lx, got: 0x%lx\n", test, MAGIC, - result); - valid = false; - } - - if (run->user_data) { - printf("FAIL: %s() user data, expected: 0x0, got: 0x%llx\n", - test, run->user_data); - valid = false; - } - - return valid; -} - -static int user_handler(long rdi, long rsi, long rdx, long ursp, long r8, long r9, - struct sgx_enclave_run *run) -{ - run->user_data = 0; - return 0; -} +FIXTURE(enclave) { + struct encl encl; + struct sgx_enclave_run run; +}; -int main(int argc, char *argv[]) +FIXTURE_SETUP(enclave) { - struct sgx_enclave_run run; + Elf64_Sym *sgx_enter_enclave_sym = NULL; struct vdso_symtab symtab; - Elf64_Sym *sgx_enter_enclave_sym; - uint64_t result = 0; - struct encl encl; + struct encl_segment *seg; unsigned int i; void *addr; - int ret; - - memset(&run, 0, sizeof(run)); - if (!encl_load("test_encl.elf", &encl)) { - encl_delete(&encl); + if (!encl_load("test_encl.elf", &self->encl)) { + encl_delete(&self->encl); ksft_exit_skip("cannot load enclaves\n"); } - if (!encl_measure(&encl)) + for (i = 0; i < self->encl.nr_segments; i++) { + seg = &self->encl.segment_tbl[i]; + + TH_LOG("0x%016lx 0x%016lx 0x%02x", seg->offset, seg->size, seg->prot); + } + + if (!encl_measure(&self->encl)) goto err; - if (!encl_build(&encl)) + if (!encl_build(&self->encl)) goto err; /* * An enclave consumer only must do this. 
*/ - for (i = 0; i < encl.nr_segments; i++) { - struct encl_segment *seg = &encl.segment_tbl[i]; - - addr = mmap((void *)encl.encl_base + seg->offset, seg->size, - seg->prot, MAP_SHARED | MAP_FIXED, encl.fd, 0); - if (addr == MAP_FAILED) { - perror("mmap() segment failed"); - exit(KSFT_FAIL); - } + for (i = 0; i < self->encl.nr_segments; i++) { + struct encl_segment *seg = &self->encl.segment_tbl[i]; + + addr = mmap((void *)self->encl.encl_base + seg->offset, seg->size, + seg->prot, MAP_SHARED | MAP_FIXED, self->encl.fd, 0); + EXPECT_NE(addr, MAP_FAILED); + if (addr == MAP_FAILED) + goto err; } - memset(&run, 0, sizeof(run)); - run.tcs = encl.encl_base; - /* Get vDSO base address */ addr = (void *)getauxval(AT_SYSINFO_EHDR); if (!addr) @@ -200,32 +164,75 @@ int main(int argc, char *argv[]) vdso_sgx_enter_enclave = addr + sgx_enter_enclave_sym->st_value; - ret = sgx_enter_enclave((void *)&MAGIC, &result, 0, EENTER, - NULL, NULL, &run); - if (!report_results(&run, ret, result, "sgx_enter_enclave_unclobbered")) - goto err; + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; +err: + if (!sgx_enter_enclave_sym) + encl_delete(&self->encl); - /* Invoke the vDSO directly. */ - result = 0; - ret = vdso_sgx_enter_enclave((unsigned long)&MAGIC, (unsigned long)&result, - 0, EENTER, 0, 0, &run); - if (!report_results(&run, ret, result, "sgx_enter_enclave")) - goto err; + ASSERT_NE(sgx_enter_enclave_sym, NULL); +} - /* And with an exit handler. */ - run.user_handler = (__u64)user_handler; - run.user_data = 0xdeadbeef; - ret = vdso_sgx_enter_enclave((unsigned long)&MAGIC, (unsigned long)&result, - 0, EENTER, 0, 0, &run); - if (!report_results(&run, ret, result, "user_handler")) - goto err; +FIXTURE_TEARDOWN(enclave) +{ + encl_delete(&self->encl); +} - printf("SUCCESS\n"); - encl_delete(&encl); - exit(KSFT_PASS); +#define ENCL_CALL(in, out, run, clobbered) \ + ({ \ + int ret; \ + if ((clobbered)) \ + ret = vdso_sgx_enter_enclave((unsigned long)(in), (unsigned long)(out), 0, \ + EENTER, 0, 0, (run)); \ + else \ + ret = sgx_enter_enclave((void *)(in), (void *)(out), 0, EENTER, NULL, NULL, \ + (run)); \ + ret; \ + }) + +TEST_F(enclave, unclobbered_vdso) +{ + uint64_t result = 0; -err: - encl_delete(&encl); - exit(KSFT_FAIL); + EXPECT_EQ(ENCL_CALL(&MAGIC, &result, &self->run, false), 0); + + EXPECT_EQ(result, MAGIC); + EXPECT_EQ(self->run.function, EEXIT); + EXPECT_EQ(self->run.user_data, 0); +} + +TEST_F(enclave, clobbered_vdso) +{ + uint64_t result = 0; + + EXPECT_EQ(ENCL_CALL(&MAGIC, &result, &self->run, true), 0); + + EXPECT_EQ(result, MAGIC); + EXPECT_EQ(self->run.function, EEXIT); + EXPECT_EQ(self->run.user_data, 0); } + +static int test_handler(long rdi, long rsi, long rdx, long ursp, long r8, long r9, + struct sgx_enclave_run *run) +{ + run->user_data = 0; + + return 0; +} + +TEST_F(enclave, clobbered_vdso_and_user_function) +{ + uint64_t result = 0; + + self->run.user_handler = (__u64)test_handler; + self->run.user_data = 0xdeadbeef; + + EXPECT_EQ(ENCL_CALL(&MAGIC, &result, &self->run, true), 0); + + EXPECT_EQ(result, MAGIC); + EXPECT_EQ(self->run.function, EEXIT); + EXPECT_EQ(self->run.user_data, 0); +} + +TEST_HARNESS_MAIN -- Gitee From 46d50f3e33e58b28837520d946c6d1c31d1079cd Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Thu, 10 Jun 2021 11:30:19 +0300 Subject: [PATCH 025/134] selftests/sgx: Dump enclave memory map commit 040efd1c35f93787cbd26be6fc6493592571f424 upstream. 
Often, it's useful to check whether /proc/self/maps looks sane when dealing with memory mapped objects, especially when they are JIT'ish dynamically constructed objects. Therefore, dump "/dev/sgx_enclave" matching lines from the memory map in FIXTURE_SETUP(). Intel-SIG: commit 040efd1c35f9 selftests/sgx: Dump enclave memory map Backport for SGX virtualization support on Intel Xeon platform. Signed-off-by: Jarkko Sakkinen Signed-off-by: Shuah Khan [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/main.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 6da19b6bf287..14030f8b85ff 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -117,6 +117,8 @@ FIXTURE_SETUP(enclave) Elf64_Sym *sgx_enter_enclave_sym = NULL; struct vdso_symtab symtab; struct encl_segment *seg; + char maps_line[256]; + FILE *maps_file; unsigned int i; void *addr; @@ -167,6 +169,18 @@ FIXTURE_SETUP(enclave) memset(&self->run, 0, sizeof(self->run)); self->run.tcs = self->encl.encl_base; + maps_file = fopen("/proc/self/maps", "r"); + if (maps_file != NULL) { + while (fgets(maps_line, sizeof(maps_line), maps_file) != NULL) { + maps_line[strlen(maps_line) - 1] = '\0'; + + if (strstr(maps_line, "/dev/sgx_enclave")) + TH_LOG("%s", maps_line); + } + + fclose(maps_file); + } + err: if (!sgx_enter_enclave_sym) encl_delete(&self->encl); -- Gitee From 45ce230acb848c1135a337a57efb38ac3ad85adc Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Thu, 10 Jun 2021 11:30:20 +0300 Subject: [PATCH 026/134] selftests/sgx: Add EXPECT_EEXIT() macro commit b334fb6fa7f38b4ad188d38307aea45e827b56ce upstream. Add EXPECT_EEXIT() macro, which will conditionally print the exception information, in addition to EXPECT_EQ(self->run.function, EEXIT); Intel-SIG: commit b334fb6fa7f3 selftests/sgx: Add EXPECT_EEXIT() macro Backport for SGX virtualization support on Intel Xeon platform. 
Signed-off-by: Jarkko Sakkinen Signed-off-by: Shuah Khan [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/main.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 14030f8b85ff..bcd0257f48e0 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -205,6 +205,14 @@ FIXTURE_TEARDOWN(enclave) ret; \ }) +#define EXPECT_EEXIT(run) \ + do { \ + EXPECT_EQ((run)->function, EEXIT); \ + if ((run)->function != EEXIT) \ + TH_LOG("0x%02x 0x%02x 0x%016llx", (run)->exception_vector, \ + (run)->exception_error_code, (run)->exception_addr); \ + } while (0) + TEST_F(enclave, unclobbered_vdso) { uint64_t result = 0; @@ -212,7 +220,7 @@ TEST_F(enclave, unclobbered_vdso) EXPECT_EQ(ENCL_CALL(&MAGIC, &result, &self->run, false), 0); EXPECT_EQ(result, MAGIC); - EXPECT_EQ(self->run.function, EEXIT); + EXPECT_EEXIT(&self->run); EXPECT_EQ(self->run.user_data, 0); } @@ -223,7 +231,7 @@ TEST_F(enclave, clobbered_vdso) EXPECT_EQ(ENCL_CALL(&MAGIC, &result, &self->run, true), 0); EXPECT_EQ(result, MAGIC); - EXPECT_EQ(self->run.function, EEXIT); + EXPECT_EEXIT(&self->run); EXPECT_EQ(self->run.user_data, 0); } @@ -245,7 +253,7 @@ TEST_F(enclave, clobbered_vdso_and_user_function) EXPECT_EQ(ENCL_CALL(&MAGIC, &result, &self->run, true), 0); EXPECT_EQ(result, MAGIC); - EXPECT_EQ(self->run.function, EEXIT); + EXPECT_EEXIT(&self->run); EXPECT_EQ(self->run.user_data, 0); } -- Gitee From 1355710ff9aba93cfa3584df481db26e20ef58d5 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Thu, 10 Jun 2021 11:30:21 +0300 Subject: [PATCH 027/134] selftests/sgx: Refine the test enclave to have storage commit 22118ce17eb8dcf2a6ba2f6fb250816ddb59685a upstream. Extend the enclave to have two operations: ENCL_OP_PUT and ENCL_OP_GET. ENCL_OP_PUT stores value inside the enclave address space and ENCL_OP_GET reads it. The internal buffer can be later extended to be variable size, and allow reclaimer tests. Intel-SIG: commit 22118ce17eb8 selftests/sgx: Refine the test enclave to have storage Backport for SGX virtualization support on Intel Xeon platform. 
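For orientation, the protocol after this change is: the untrusted side fills a struct encl_op and passes its address in RDI at EENTER; the enclave interprets op->type and either copies op->buffer into its internal buffer (PUT) or copies the buffer back out (GET). A condensed sketch of one round trip, assuming the struct encl_op and ENCL_CALL() wrapper introduced by the diff below:

	struct encl_op op;

	/* Store MAGIC in the enclave's internal buffer. */
	op.type = ENCL_OP_PUT;
	op.buffer = MAGIC;
	EXPECT_EQ(ENCL_CALL(&op, &self->run, false), 0);

	/* Read it back; the enclave overwrites op.buffer. */
	op.type = ENCL_OP_GET;
	op.buffer = 0;
	EXPECT_EQ(ENCL_CALL(&op, &self->run, false), 0);
	EXPECT_EQ(op.buffer, MAGIC);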
Signed-off-by: Jarkko Sakkinen Signed-off-by: Shuah Khan [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/defines.h | 10 ++++ tools/testing/selftests/sgx/main.c | 57 ++++++++++++++++++----- tools/testing/selftests/sgx/test_encl.c | 19 +++++++- tools/testing/selftests/sgx/test_encl.lds | 3 +- 4 files changed, 74 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h index 0bd73428d2f3..f88562afcaa0 100644 --- a/tools/testing/selftests/sgx/defines.h +++ b/tools/testing/selftests/sgx/defines.h @@ -18,4 +18,14 @@ #include "../../../../arch/x86/include/asm/enclu.h" #include "../../../../arch/x86/include/uapi/asm/sgx.h" +enum encl_op_type { + ENCL_OP_PUT, + ENCL_OP_GET, +}; + +struct encl_op { + uint64_t type; + uint64_t buffer; +}; + #endif /* DEFINES_H */ diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index bcd0257f48e0..e252015e0c15 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -193,14 +193,14 @@ FIXTURE_TEARDOWN(enclave) encl_delete(&self->encl); } -#define ENCL_CALL(in, out, run, clobbered) \ +#define ENCL_CALL(op, run, clobbered) \ ({ \ int ret; \ if ((clobbered)) \ - ret = vdso_sgx_enter_enclave((unsigned long)(in), (unsigned long)(out), 0, \ + ret = vdso_sgx_enter_enclave((unsigned long)(op), 0, 0, \ EENTER, 0, 0, (run)); \ else \ - ret = sgx_enter_enclave((void *)(in), (void *)(out), 0, EENTER, NULL, NULL, \ + ret = sgx_enter_enclave((void *)(op), NULL, 0, EENTER, NULL, NULL, \ (run)); \ ret; \ }) @@ -215,22 +215,44 @@ FIXTURE_TEARDOWN(enclave) TEST_F(enclave, unclobbered_vdso) { - uint64_t result = 0; + struct encl_op op; - EXPECT_EQ(ENCL_CALL(&MAGIC, &result, &self->run, false), 0); + op.type = ENCL_OP_PUT; + op.buffer = MAGIC; + + EXPECT_EQ(ENCL_CALL(&op, &self->run, false), 0); - EXPECT_EQ(result, MAGIC); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.user_data, 0); + + op.type = ENCL_OP_GET; + op.buffer = 0; + + EXPECT_EQ(ENCL_CALL(&op, &self->run, false), 0); + + EXPECT_EQ(op.buffer, MAGIC); EXPECT_EEXIT(&self->run); EXPECT_EQ(self->run.user_data, 0); } TEST_F(enclave, clobbered_vdso) { - uint64_t result = 0; + struct encl_op op; + + op.type = ENCL_OP_PUT; + op.buffer = MAGIC; + + EXPECT_EQ(ENCL_CALL(&op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.user_data, 0); + + op.type = ENCL_OP_GET; + op.buffer = 0; - EXPECT_EQ(ENCL_CALL(&MAGIC, &result, &self->run, true), 0); + EXPECT_EQ(ENCL_CALL(&op, &self->run, true), 0); - EXPECT_EQ(result, MAGIC); + EXPECT_EQ(op.buffer, MAGIC); EXPECT_EEXIT(&self->run); EXPECT_EQ(self->run.user_data, 0); } @@ -245,14 +267,25 @@ static int test_handler(long rdi, long rsi, long rdx, long ursp, long r8, long r TEST_F(enclave, clobbered_vdso_and_user_function) { - uint64_t result = 0; + struct encl_op op; self->run.user_handler = (__u64)test_handler; self->run.user_data = 0xdeadbeef; - EXPECT_EQ(ENCL_CALL(&MAGIC, &result, &self->run, true), 0); + op.type = ENCL_OP_PUT; + op.buffer = MAGIC; + + EXPECT_EQ(ENCL_CALL(&op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.user_data, 0); + + op.type = ENCL_OP_GET; + op.buffer = 0; + + EXPECT_EQ(ENCL_CALL(&op, &self->run, true), 0); - EXPECT_EQ(result, MAGIC); + EXPECT_EQ(op.buffer, MAGIC); EXPECT_EEXIT(&self->run); EXPECT_EQ(self->run.user_data, 0); } diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c index 
cf25b5dc1e03..734ea52f9924 100644 --- a/tools/testing/selftests/sgx/test_encl.c +++ b/tools/testing/selftests/sgx/test_encl.c @@ -4,6 +4,8 @@ #include #include "defines.h" +static uint8_t encl_buffer[8192] = { 1 }; + static void *memcpy(void *dest, const void *src, size_t n) { size_t i; @@ -14,7 +16,20 @@ static void *memcpy(void *dest, const void *src, size_t n) return dest; } -void encl_body(void *rdi, void *rsi) +void encl_body(void *rdi, void *rsi) { - memcpy(rsi, rdi, 8); + struct encl_op *op = (struct encl_op *)rdi; + + switch (op->type) { + case ENCL_OP_PUT: + memcpy(&encl_buffer[0], &op->buffer, 8); + break; + + case ENCL_OP_GET: + memcpy(&op->buffer, &encl_buffer[0], 8); + break; + + default: + break; + } } diff --git a/tools/testing/selftests/sgx/test_encl.lds b/tools/testing/selftests/sgx/test_encl.lds index 0fbbda7e665e..a1ec64f7d91f 100644 --- a/tools/testing/selftests/sgx/test_encl.lds +++ b/tools/testing/selftests/sgx/test_encl.lds @@ -18,9 +18,10 @@ SECTIONS .text : { *(.text*) *(.rodata*) + FILL(0xDEADBEEF); + . = ALIGN(4096); } : text - . = ALIGN(4096); .data : { *(.data*) } : data -- Gitee From 4b592f1840b12d2cca943a8aecf8452737922105 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 21 Jun 2021 12:05:56 -0700 Subject: [PATCH 028/134] selftests/sgx: remove checks for file execute permissions commit 4896df9d53ae5521f3ce83751e828ad70bc65c80 upstream. The SGX selftests can fail for a bunch of non-obvious reasons like 'noexec' permissions on /dev (which is the default *EVERYWHERE* it seems). A new test mistakenly also looked for +x permission on the /dev/sgx_enclave. File execute permissions really only apply to the ability of execve() to work on a file, *NOT* on the ability for an application to map the file with PROT_EXEC. SGX needs to mmap(PROT_EXEC), but doesn't need to execve() the device file. Remove the check. Intel-SIG: commit 4896df9d53ae selftests/sgx: remove checks for file execute permissions Backport for SGX virtualization support on Intel Xeon platform. Fixes: 4284f7acb78b ("selftests/sgx: Improve error detection and messages") Reported-by: Tim Gardner Cc: Jarkko Sakkinen Cc: Reinette Chatre Cc: Dave Hansen Cc: Shuah Khan Cc: linux-sgx@vger.kernel.org Cc: linux-kselftest@vger.kernel.org Cc: linux-kernel@vger.kernel.org Tested-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Signed-off-by: Shuah Khan [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/load.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/sgx/load.c b/tools/testing/selftests/sgx/load.c index 00928be57fc4..3ebe5d1fe337 100644 --- a/tools/testing/selftests/sgx/load.c +++ b/tools/testing/selftests/sgx/load.c @@ -150,16 +150,6 @@ bool encl_load(const char *path, struct encl *encl) goto err; } - /* - * This just checks if the /dev file has these permission - * bits set. It does not check that the current user is - * the owner or in the owning group. 
- */ - if (!(sb.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) { - fprintf(stderr, "no execute permissions on device file %s\n", device_path); - goto err; - } - ptr = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED, fd, 0); if (ptr == (void *)-1) { perror("mmap for read"); @@ -169,13 +159,13 @@ bool encl_load(const char *path, struct encl *encl) #define ERR_MSG \ "mmap() succeeded for PROT_READ, but failed for PROT_EXEC.\n" \ -" Check that current user has execute permissions on %s and \n" \ -" that /dev does not have noexec set: mount | grep \"/dev .*noexec\"\n" \ +" Check that /dev does not have noexec set:\n" \ +" \tmount | grep \"/dev .*noexec\"\n" \ " If so, remount it executable: mount -o remount,exec /dev\n\n" ptr = mmap(NULL, PAGE_SIZE, PROT_EXEC, MAP_SHARED, fd, 0); if (ptr == (void *)-1) { - fprintf(stderr, ERR_MSG, device_path); + fprintf(stderr, ERR_MSG); goto err; } munmap(ptr, PAGE_SIZE); -- Gitee From 09811ebee8a16d7644951fa01d76442609ca8966 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Mon, 5 Jul 2021 08:09:21 +0300 Subject: [PATCH 029/134] selftests/sgx: Fix Q1 and Q2 calculation in sigstruct.c commit 567c39047dbee341244fe3bf79fea24ee0897ff9 upstream. Q1 and Q2 are numbers with a *maximum* length of 384 bytes. If the calculated length of Q1 and Q2 is less than 384 bytes, things will go wrong. E.g. if Q2 is 383 bytes, then 1. The bytes of q2 are copied to sigstruct->q2 in calc_q1q2(). 2. The entire sigstruct->q2 is reversed, which results in it being 256 * Q2, because the zero byte at the end of sigstruct->q2 ends up in front of the bytes written by calc_q1q2(). A change in either the key or the measurement can trigger the bug. E.g. an unmeasured heap could cause a devastating change in Q1 or Q2. Reverse exactly the bytes of Q1 and Q2 in calc_q1q2() before returning to the caller. Intel-SIG: commit 567c39047dbe selftests/sgx: Fix Q1 and Q2 calculation in sigstruct.c Backport for SGX virtualization support on Intel Xeon platform. 
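For reference, Q1 and Q2 are the precomputed quotients hardware uses to verify the RSA-3072 signature without doing full big-number division. With signature s and modulus m, the standard construction (a sketch of what calc_q1q2() computes with OpenSSL bignums; not text from the original patch) is:

	q1 = s^2 / m                    (integer division)
	q2 = ((s^2 mod m) * s) / m

Both quotients fit in at most 384 bytes. SIGSTRUCT stores them little-endian, while BN_bn2bin() emits big-endian bytes with leading zeros stripped, so the reversal has to cover exactly the number of bytes BN_bn2bin() actually wrote, not the full 384-byte field.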
Fixes: 2adcba79e69d ("selftests/x86: Add a selftest for SGX") Link: https://lore.kernel.org/linux-sgx/20210301051836.30738-1-tianjia.zhang@linux.alibaba.com/ Signed-off-by: Tianjia Zhang Signed-off-by: Jarkko Sakkinen Signed-off-by: Shuah Khan [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/sigstruct.c | 41 +++++++++++++------------ 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/sgx/sigstruct.c b/tools/testing/selftests/sgx/sigstruct.c index dee7a3d6c5a5..92bbc5a15c39 100644 --- a/tools/testing/selftests/sgx/sigstruct.c +++ b/tools/testing/selftests/sgx/sigstruct.c @@ -55,10 +55,27 @@ static bool alloc_q1q2_ctx(const uint8_t *s, const uint8_t *m, return true; } +static void reverse_bytes(void *data, int length) +{ + int i = 0; + int j = length - 1; + uint8_t temp; + uint8_t *ptr = data; + + while (i < j) { + temp = ptr[i]; + ptr[i] = ptr[j]; + ptr[j] = temp; + i++; + j--; + } +} + static bool calc_q1q2(const uint8_t *s, const uint8_t *m, uint8_t *q1, uint8_t *q2) { struct q1q2_ctx ctx; + int len; if (!alloc_q1q2_ctx(s, m, &ctx)) { fprintf(stderr, "Not enough memory for Q1Q2 calculation\n"); @@ -89,8 +106,10 @@ static bool calc_q1q2(const uint8_t *s, const uint8_t *m, uint8_t *q1, goto out; } - BN_bn2bin(ctx.q1, q1); - BN_bn2bin(ctx.q2, q2); + len = BN_bn2bin(ctx.q1, q1); + reverse_bytes(q1, len); + len = BN_bn2bin(ctx.q2, q2); + reverse_bytes(q2, len); free_q1q2_ctx(&ctx); return true; @@ -152,22 +171,6 @@ static RSA *gen_sign_key(void) return key; } -static void reverse_bytes(void *data, int length) -{ - int i = 0; - int j = length - 1; - uint8_t temp; - uint8_t *ptr = data; - - while (i < j) { - temp = ptr[i]; - ptr[i] = ptr[j]; - ptr[j] = temp; - i++; - j--; - } -} - enum mrtags { MRECREATE = 0x0045544145524345, MREADD = 0x0000000044444145, @@ -367,8 +370,6 @@ bool encl_measure(struct encl *encl) /* BE -> LE */ reverse_bytes(sigstruct->signature, SGX_MODULUS_SIZE); reverse_bytes(sigstruct->modulus, SGX_MODULUS_SIZE); - reverse_bytes(sigstruct->q1, SGX_MODULUS_SIZE); - reverse_bytes(sigstruct->q2, SGX_MODULUS_SIZE); EVP_MD_CTX_destroy(ctx); RSA_free(key); -- Gitee From ccc12eef7fee71be4dda281fc380d56a40ec63e6 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 19 Mar 2021 20:22:17 +1300 Subject: [PATCH 030/134] x86/cpufeatures: Make SGX_LC feature bit depend on SGX bit commit e9a15a40e857fc6ccfbb05fec7b184e9003057df upstream. Move SGX_LC feature bit to CPUID dependency table to make clearing all SGX feature bits easier. Also remove clear_sgx_caps() since it is just a wrapper of setup_clear_cpu_cap(X86_FEATURE_SGX) now. Intel-SIG: commit e9a15a40e857 x86/cpufeatures: Make SGX_LC feature bit depend on SGX bit. Backport for SGX virtualization support. Suggested-by: Sean Christopherson Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Acked-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/5d4220fd0a39f52af024d3fa166231c1d498dd10.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du [ Zhiquan Li: amend commit log and resolve the conflict. 
] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/cpuid-deps.c | 1 + arch/x86/kernel/cpu/feat_ctl.c | 12 +++--------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 91ae604925f3..144d3977abf1 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -72,6 +72,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW }, { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA }, + { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index ccec52073f8a..d9c736b0bdcf 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -5,15 +5,9 @@ #include #include -static void clear_sgx_caps(void) -{ - setup_clear_cpu_cap(X86_FEATURE_SGX); - setup_clear_cpu_cap(X86_FEATURE_SGX_LC); -} - static int __init nosgx(char *str) { - clear_sgx_caps(); + setup_clear_cpu_cap(X86_FEATURE_SGX); return 0; } @@ -26,7 +20,7 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) u64 msr; if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) { - clear_sgx_caps(); + clear_cpu_cap(c, X86_FEATURE_SGX); return; } @@ -69,6 +63,6 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) { if (enable_sgx) pr_err_once("SGX disabled by BIOS\n"); - clear_sgx_caps(); + clear_cpu_cap(c, X86_FEATURE_SGX); } } -- Gitee From b4265b7fa265b1964f09e4577fda1fcf22f2e954 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Mar 2021 20:22:18 +1300 Subject: [PATCH 031/134] x86/cpufeatures: Add SGX1 and SGX2 sub-features commit b8921dccf3b25798409d35155b5d127085de72c2 upstream. Add SGX1 and SGX2 feature flags, via CPUID.0x12.0x0.EAX, as scattered features, since adding a new leaf for only two bits would be wasteful. As part of virtualizing SGX, KVM will expose the SGX CPUID leafs to its guest, and to do so correctly needs to query hardware and kernel support for SGX1 and SGX2. Suppress both SGX1 and SGX2 from /proc/cpuinfo. SGX1 basically means SGX, and for SGX2 there is no concrete use case of using it in /proc/cpuinfo. Intel-SIG: commit b8921dccf3b2 x86/cpufeatures: Add SGX1 and SGX2 sub-features. Backport for SGX virtualization support. Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/d787827dbfca6b3210ac3e432e3ac1202727e786.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du [ Zhiquan Li: amend commit log and resolve the conflict. 
] Signed-off-by: Zhiquan Li --- arch/x86/include/asm/cpufeatures.h | 2 ++ arch/x86/kernel/cpu/cpuid-deps.c | 2 ++ arch/x86/kernel/cpu/scattered.c | 2 ++ 3 files changed, 6 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 0778e487c780..9683ca0a39fd 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -288,6 +288,8 @@ #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ #define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ +#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */ +#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 144d3977abf1..b4de14be1509 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -73,6 +73,8 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA }, { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, + { X86_FEATURE_SGX1, X86_FEATURE_SGX }, + { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 5cc4b5d43f45..b8386e59f8d5 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -36,6 +36,8 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, + { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, + { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, -- Gitee From 0d9c048cfdd94824d9b99f8df757a6470085e7c8 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Thu, 25 Mar 2021 22:30:57 +1300 Subject: [PATCH 032/134] x86/sgx: Wipe out EREMOVE from sgx_free_epc_page() commit b0c7459be0670fabe080e30906ba9fe62df5e02c upstream. EREMOVE takes a page and removes any association between that page and an enclave. It must be run on a page before it can be added into another enclave. Currently, EREMOVE is run as part of pages being freed into the SGX page allocator. It is not expected to fail, as it would indicate a use-after-free of EPC pages. Rather than add the page back to the pool of available EPC pages, the kernel intentionally leaks the page to avoid additional errors in the future. However, KVM does not track how guest pages are used, which means that SGX virtualization use of EREMOVE might fail. Specifically, it is legitimate that EREMOVE returns SGX_CHILD_PRESENT for EPC assigned to KVM guest, because KVM/kernel doesn't track SECS pages. To allow SGX/KVM to introduce a more permissive EREMOVE helper and to let the SGX virtualization code use the allocator directly, break out the EREMOVE call from the SGX page allocator. 
Rename the original sgx_free_epc_page() to sgx_encl_free_epc_page(), indicating that it is used to free an EPC page assigned to a host enclave. Replace sgx_free_epc_page() with sgx_encl_free_epc_page() in all call sites so there's no functional change. At the same time, improve the error message when EREMOVE fails, and add documentation to explain to the user what that failure means and what to do in case it happens. [ bp: Massage commit message, fix typos and sanitize text, simplify. ] Intel-SIG: commit b0c7459be067 x86/sgx: Wipe out EREMOVE from sgx_free_epc_page(). Backport for SGX virtualization support. Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Reviewed-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/20210325093057.122834-1-kai.huang@intel.com Signed-off-by: Fan Du [ Zhiquan Li: amend commit log and resolve the conflict. ] Signed-off-by: Zhiquan Li --- Documentation/x86/sgx.rst | 25 +++++++++++++++++++++++++ arch/x86/kernel/cpu/sgx/encl.c | 31 ++++++++++++++++++++++++++----- arch/x86/kernel/cpu/sgx/encl.h | 1 + arch/x86/kernel/cpu/sgx/ioctl.c | 6 +++--- arch/x86/kernel/cpu/sgx/main.c | 14 +++++--------- arch/x86/kernel/cpu/sgx/sgx.h | 4 ++++ 6 files changed, 64 insertions(+), 17 deletions(-) diff --git a/Documentation/x86/sgx.rst b/Documentation/x86/sgx.rst index eaee1368b4fd..f90076e67cde 100644 --- a/Documentation/x86/sgx.rst +++ b/Documentation/x86/sgx.rst @@ -209,3 +209,28 @@ An application may be loaded into a container enclave which is specially configured with a library OS and run-time which permits the application to run. The enclave run-time and library OS work together to execute the application when a thread enters the enclave. + +Impact of Potential Kernel SGX Bugs +=================================== + +EPC leaks +--------- + +When EPC page leaks happen, a WARNING like this is shown in dmesg: + +"EREMOVE returned ... and an EPC page was leaked. SGX may become unusable..." + +This is effectively a kernel use-after-free of an EPC page, and due +to the way SGX works, the bug is detected at freeing. Rather than +adding the page back to the pool of available EPC pages, the kernel +intentionally leaks the page to avoid additional errors in the future. + +When this happens, the kernel will likely soon leak more EPC pages, and +SGX will likely become unusable because the memory available to SGX is +limited. However, while this may be fatal to SGX, the rest of the kernel +is unlikely to be impacted and should continue to work. + +As a result, when this happens, the user should stop running any new +SGX workloads (or just any new workloads), and migrate all valuable +workloads. Although a machine reboot can recover all EPC memory, the bug +should be reported to Linux developers. 
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 759ec02f59e8..f61f946755a8 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -78,7 +78,7 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, ret = __sgx_encl_eldu(encl_page, epc_page, secs_page); if (ret) { - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); return ERR_PTR(ret); } @@ -404,7 +404,7 @@ void sgx_encl_release(struct kref *ref) if (sgx_unmark_page_reclaimable(entry->epc_page)) continue; - sgx_free_epc_page(entry->epc_page); + sgx_encl_free_epc_page(entry->epc_page); encl->secs_child_cnt--; entry->epc_page = NULL; } @@ -416,7 +416,7 @@ void sgx_encl_release(struct kref *ref) xa_destroy(&encl->page_array); if (!encl->secs_child_cnt && encl->secs.epc_page) { - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; } @@ -424,7 +424,7 @@ void sgx_encl_release(struct kref *ref) va_page = list_first_entry(&encl->va_pages, struct sgx_va_page, list); list_del(&va_page->list); - sgx_free_epc_page(va_page->epc_page); + sgx_encl_free_epc_page(va_page->epc_page); kfree(va_page); } @@ -687,7 +687,7 @@ struct sgx_epc_page *sgx_alloc_va_page(void) ret = __epa(sgx_get_epc_virt_addr(epc_page)); if (ret) { WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret); - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); return ERR_PTR(-EFAULT); } @@ -736,3 +736,24 @@ bool sgx_va_page_full(struct sgx_va_page *va_page) return slot == SGX_VA_SLOT_COUNT; } + +/** + * sgx_encl_free_epc_page - free an EPC page assigned to an enclave + * @page: EPC page to be freed + * + * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and + * only upon success, it puts the page back to free page list. Otherwise, it + * gives a WARNING to indicate page is leaked. 
+ */ +void sgx_encl_free_epc_page(struct sgx_epc_page *page) +{ + int ret; + + WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); + + ret = __eremove(sgx_get_epc_virt_addr(page)); + if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret)) + return; + + sgx_free_epc_page(page); +} diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index d8d30ccbef4c..6e74f85b6264 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -115,5 +115,6 @@ struct sgx_epc_page *sgx_alloc_va_page(void); unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); bool sgx_va_page_full(struct sgx_va_page *va_page); +void sgx_encl_free_epc_page(struct sgx_epc_page *page); #endif /* _X86_ENCL_H */ diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 8f7b427da4f6..1752d7c4f4f2 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -47,7 +47,7 @@ static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) encl->page_cnt--; if (va_page) { - sgx_free_epc_page(va_page->epc_page); + sgx_encl_free_epc_page(va_page->epc_page); list_del(&va_page->list); kfree(va_page); } @@ -117,7 +117,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) return 0; err_out: - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; err_out_backing: @@ -365,7 +365,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, up_read(¤t->mm->mmap_sem); err_out_free: - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); kfree(encl_page); return ret; diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 3e03d533231c..6e8e72af936a 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -294,7 +294,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, sgx_encl_ewb(encl->secs.epc_page, &secs_backing); - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; sgx_encl_put_backing(&secs_backing, true); @@ -609,19 +609,15 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim) * sgx_free_epc_page() - Free an EPC page * @page: an EPC page * - * Call EREMOVE for an EPC page and insert it back to the list of free pages. + * Put the EPC page back to the list of free pages. It's the caller's + * responsibility to make sure that the page is in uninitialized state. In other + * words, do EREMOVE, EWB or whatever operation is necessary before calling + * this function. */ void sgx_free_epc_page(struct sgx_epc_page *page) { struct sgx_epc_section *section = &sgx_epc_sections[page->section]; struct sgx_numa_node *node = section->node; - int ret; - - WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); - - ret = __eremove(sgx_get_epc_virt_addr(page)); - if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret)) - return; spin_lock(&node->lock); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 2a2b5c857451..65b2e88dbedd 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -13,6 +13,10 @@ #undef pr_fmt #define pr_fmt(fmt) "sgx: " fmt +#define EREMOVE_ERROR_MESSAGE \ + "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \ + "Refer to Documentation/x86/sgx.rst for more information." 
+ #define SGX_MAX_EPC_SECTIONS 8 #define SGX_EEXTEND_BLOCK_SIZE 256 #define SGX_NR_TO_SCAN 16 -- Gitee From e1879ed612b0a73f5380d85fb7ac2b15ee541cec Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Mar 2021 20:22:20 +1300 Subject: [PATCH 033/134] x86/sgx: Add SGX_CHILD_PRESENT hardware error code commit 231d3dbdda192e3b3c7b79f4c3b0616f6c7f31b7 upstream. The SGX driver can accurately track how enclave pages are used. This enables SECS to be specifically targeted and EREMOVE'd only after all child pages have been EREMOVE'd. This ensures that the SGX driver will never encounter SGX_CHILD_PRESENT in normal operation. Virtual EPC is different. The host does not track how EPC pages are used by the guest, so it cannot guarantee EREMOVE success. It might, for instance, encounter a SECS with a non-zero child count. Add a definition of SGX_CHILD_PRESENT. It will be used exclusively by the SGX virtualization driver to handle recoverable EREMOVE errors when sanitizing EPC pages after they are freed. Intel-SIG: commit 231d3dbdda19 x86/sgx: Add SGX_CHILD_PRESENT hardware error code. Backport for SGX virtualization support. Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/050b198e882afde7e6eba8e6a0d4da39161dbb5a.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/include/asm/sgx.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index d96e5451d28a..14bb5f7e221c 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -31,12 +31,14 @@ * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not * been completed yet. + * %SGX_CHILD_PRESENT: SECS has child pages present in the EPC. * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's * public key does not match IA32_SGXLEPUBKEYHASH. * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received */ enum sgx_return_code { SGX_NOT_TRACKED = 11, + SGX_CHILD_PRESENT = 13, SGX_INVALID_EINITTOKEN = 16, SGX_UNMASKED_EVENT = 128, }; -- Gitee From fcee579ab5a92761acbb3b0a68e4285c5a72bead Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Mar 2021 20:22:21 +1300 Subject: [PATCH 034/134] x86/sgx: Introduce virtual EPC for use by KVM guests commit 540745ddbc70eabdc7dbd3fcc00fe4fb17cd59ba upstream. Add a misc device /dev/sgx_vepc to allow userspace to allocate "raw" Enclave Page Cache (EPC) without an associated enclave. The intended and only known use case for raw EPC allocation is to expose EPC to a KVM guest, hence the 'vepc' moniker, virt.{c,h} files and X86_SGX_KVM Kconfig. The SGX driver uses the misc device /dev/sgx_enclave to support userspace in creating an enclave. Each file descriptor returned from opening /dev/sgx_enclave represents an enclave. Unlike the SGX driver, KVM doesn't control how the guest uses the EPC, therefore EPC allocated to a KVM guest is not associated with an enclave, and /dev/sgx_enclave is not suitable for allocating EPC for a KVM guest. Having separate device nodes for the SGX driver and KVM virtual EPC also allows separate permission control for running host SGX enclaves and KVM SGX guests. 
To use /dev/sgx_vepc to allocate a virtual EPC instance with a particular size, the hypervisor opens /dev/sgx_vepc and uses mmap() with the intended size to get an address range of virtual EPC. Then it may use the address range to create one KVM memory slot as virtual EPC for a guest. Implement the "raw" EPC allocation in the x86 core-SGX subsystem via /dev/sgx_vepc rather than in KVM. Doing so has two major advantages: - Does not require changes to KVM's uAPI, e.g. EPC gets handled as just another memory backend for guests. - EPC management is wholly contained in the SGX subsystem, e.g. SGX does not have to export any symbols, changes to reclaim flows don't need to be routed through KVM, SGX's dirty laundry doesn't have to get aired out for the world to see, and so on and so forth. The virtual EPC pages allocated to guests are currently not reclaimable. Reclaiming an EPC page used by an enclave requires a special reclaim mechanism separate from normal page reclaim, and that mechanism is not supported for virtual EPC pages. Due to the complications of handling reclaim conflicts between guest and host, reclaiming virtual EPC pages is significantly more complex than basic support for SGX virtualization. [ bp: - Massage commit message and comments - use cpu_feature_enabled() - vertically align struct members init - massage Virtual EPC clarification text - move Kconfig prompt to Virtualization ] Intel-SIG: commit 540745ddbc70 x86/sgx: Introduce virtual EPC for use by KVM guests. Backport for SGX virtualization support. Signed-off-by: Sean Christopherson Co-developed-by: Kai Huang Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/0c38ced8c8e5a69872db4d6a1c0dabd01e07cad7.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- Documentation/x86/sgx.rst | 16 ++ arch/x86/kernel/cpu/sgx/Makefile | 1 + arch/x86/kernel/cpu/sgx/sgx.h | 9 ++ arch/x86/kernel/cpu/sgx/virt.c | 259 +++++++++++++++++++++++++++++++ arch/x86/kvm/Kconfig | 12 ++ 5 files changed, 297 insertions(+) create mode 100644 arch/x86/kernel/cpu/sgx/virt.c diff --git a/Documentation/x86/sgx.rst b/Documentation/x86/sgx.rst index f90076e67cde..dd0ac96ff9ef 100644 --- a/Documentation/x86/sgx.rst +++ b/Documentation/x86/sgx.rst @@ -234,3 +234,19 @@ As a result, when this happens, the user should stop running any new SGX workloads (or just any new workloads), and migrate all valuable workloads. Although a machine reboot can recover all EPC memory, the bug should be reported to Linux developers. + + +Virtual EPC +=========== + +The implementation also has a virtual EPC driver to support SGX enclaves +in guests. Unlike the SGX driver, an EPC page allocated by the virtual +EPC driver doesn't have a specific enclave associated with it. This is because KVM doesn't track how a guest uses EPC pages. + +As a result, the SGX core page reclaimer doesn't support reclaiming EPC +pages allocated to KVM guests through the virtual EPC driver. If the user wants to deploy SGX applications both on the host and in guests on the same machine, the user should reserve enough EPC (by taking out total virtual EPC size of all SGX VMs from the physical EPC size) for host SGX applications so they can run with acceptable performance. 
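To make the flow described above concrete, a hypervisor-side sketch might look like the following. This is illustrative only and not part of the patch; the slot number and the guest EPC base are made-up placeholders, and error handling is minimal. MAP_SHARED is required by the driver's mmap handler, and pages are faulted in on demand:

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <linux/kvm.h>
	#include <err.h>

	/* Back a guest's EPC with one virtual EPC instance. */
	static void setup_guest_epc(int vm_fd, size_t epc_size, __u64 guest_epc_gpa)
	{
		int fd = open("/dev/sgx_vepc", O_RDWR);
		if (fd < 0)
			err(1, "open /dev/sgx_vepc");

		void *epc = mmap(NULL, epc_size, PROT_READ | PROT_WRITE | PROT_EXEC,
				 MAP_SHARED, fd, 0);
		if (epc == MAP_FAILED)
			err(1, "mmap virtual EPC");

		/* Expose the range to the guest as one KVM memory slot. */
		struct kvm_userspace_memory_region region = {
			.slot = 10,			/* placeholder slot id */
			.guest_phys_addr = guest_epc_gpa,
			.memory_size = epc_size,
			.userspace_addr = (__u64)epc,
		};
		if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) < 0)
			err(1, "KVM_SET_USER_MEMORY_REGION");
	}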
diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile index 91d3dc784a29..9c1656779b2a 100644 --- a/arch/x86/kernel/cpu/sgx/Makefile +++ b/arch/x86/kernel/cpu/sgx/Makefile @@ -3,3 +3,4 @@ obj-y += \ encl.o \ ioctl.o \ main.o +obj-$(CONFIG_X86_SGX_KVM) += virt.o diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 65b2e88dbedd..e4cbc71bf136 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -84,4 +84,13 @@ void sgx_mark_page_reclaimable(struct sgx_epc_page *page); int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); +#ifdef CONFIG_X86_SGX_KVM +int __init sgx_vepc_init(void); +#else +static inline int __init sgx_vepc_init(void) +{ + return -ENODEV; +} +#endif + #endif /* _X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c new file mode 100644 index 000000000000..259cc46ad78c --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Device driver to expose SGX enclave memory to KVM guests. + * + * Copyright(c) 2021 Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "encls.h" +#include "sgx.h" + +struct sgx_vepc { + struct xarray page_array; + struct mutex lock; +}; + +/* + * Temporary SECS pages that cannot be EREMOVE'd due to having child in other + * virtual EPC instances, and the lock to protect it. + */ +static struct mutex zombie_secs_pages_lock; +static struct list_head zombie_secs_pages; + +static int __sgx_vepc_fault(struct sgx_vepc *vepc, + struct vm_area_struct *vma, unsigned long addr) +{ + struct sgx_epc_page *epc_page; + unsigned long index, pfn; + int ret; + + WARN_ON(!mutex_is_locked(&vepc->lock)); + + /* Calculate index of EPC page in virtual EPC's page_array */ + index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start); + + epc_page = xa_load(&vepc->page_array, index); + if (epc_page) + return 0; + + epc_page = sgx_alloc_epc_page(vepc, false); + if (IS_ERR(epc_page)) + return PTR_ERR(epc_page); + + ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL)); + if (ret) + goto err_free; + + pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page)); + + ret = vmf_insert_pfn(vma, addr, pfn); + if (ret != VM_FAULT_NOPAGE) { + ret = -EFAULT; + goto err_delete; + } + + return 0; + +err_delete: + xa_erase(&vepc->page_array, index); +err_free: + sgx_free_epc_page(epc_page); + return ret; +} + +static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct sgx_vepc *vepc = vma->vm_private_data; + int ret; + + mutex_lock(&vepc->lock); + ret = __sgx_vepc_fault(vepc, vma, vmf->address); + mutex_unlock(&vepc->lock); + + if (!ret) + return VM_FAULT_NOPAGE; + + if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) { + mmap_read_unlock(vma->vm_mm); + return VM_FAULT_RETRY; + } + + return VM_FAULT_SIGBUS; +} + +const struct vm_operations_struct sgx_vepc_vm_ops = { + .fault = sgx_vepc_fault, +}; + +static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct sgx_vepc *vepc = file->private_data; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vma->vm_ops = &sgx_vepc_vm_ops; + /* Don't copy VMA in fork() */ + vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY; + vma->vm_private_data = vepc; + + return 0; +} + +static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) +{ 
+ int ret; + + /* + * Take a previously guest-owned EPC page and return it to the + * general EPC page pool. + * + * Guests cannot be trusted to have left this page in a good + * state, so run EREMOVE on the page unconditionally. In the + * case that a guest properly EREMOVE'd this page, a superfluous + * EREMOVE is harmless. + */ + ret = __eremove(sgx_get_epc_virt_addr(epc_page)); + if (ret) { + /* + * Only SGX_CHILD_PRESENT is expected, which is because of + * EREMOVE'ing an SECS still with child, in which case it can + * be handled by EREMOVE'ing the SECS again after all pages in + * virtual EPC have been EREMOVE'd. See comments below in + * sgx_vepc_release(). + * + * The user of virtual EPC (KVM) needs to guarantee that no + * logical processor is still running in the enclave in the + * guest, otherwise EREMOVE will get SGX_ENCLAVE_ACT which + * cannot be handled here. + */ + WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE, + ret, ret); + return ret; + } + + sgx_free_epc_page(epc_page); + + return 0; +} + +static int sgx_vepc_release(struct inode *inode, struct file *file) +{ + struct sgx_vepc *vepc = file->private_data; + struct sgx_epc_page *epc_page, *tmp, *entry; + unsigned long index; + + LIST_HEAD(secs_pages); + + xa_for_each(&vepc->page_array, index, entry) { + /* + * Remove all normal, child pages. sgx_vepc_free_page() + * will fail if EREMOVE fails, but this is OK and expected on + * SECS pages. Those can only be EREMOVE'd *after* all their + * child pages. Retries below will clean them up. + */ + if (sgx_vepc_free_page(entry)) + continue; + + xa_erase(&vepc->page_array, index); + } + + /* + * Retry EREMOVE'ing pages. This will clean up any SECS pages that + * only had children in this 'epc' area. + */ + xa_for_each(&vepc->page_array, index, entry) { + epc_page = entry; + /* + * An EREMOVE failure here means that the SECS page still + * has children. But, since all children in this 'sgx_vepc' + * have been removed, the SECS page must have a child on + * another instance. + */ + if (sgx_vepc_free_page(epc_page)) + list_add_tail(&epc_page->list, &secs_pages); + + xa_erase(&vepc->page_array, index); + } + + /* + * SECS pages are "pinned" by child pages, and "unpinned" once all + * children have been EREMOVE'd. A child page in this instance + * may have pinned an SECS page encountered in an earlier release(), + * creating a zombie. Since some children were EREMOVE'd above, + * try to EREMOVE all zombies in the hopes that one was unpinned. + */ + mutex_lock(&zombie_secs_pages_lock); + list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) { + /* + * Speculatively remove the page from the list of zombies; + * if the page is successfully EREMOVE'd it will be added to + * the list of free pages. If EREMOVE fails, throw the page + * on the local list, which will be spliced on at the end. 
+ */ + list_del(&epc_page->list); + + if (sgx_vepc_free_page(epc_page)) + list_add_tail(&epc_page->list, &secs_pages); + } + + if (!list_empty(&secs_pages)) + list_splice_tail(&secs_pages, &zombie_secs_pages); + mutex_unlock(&zombie_secs_pages_lock); + + kfree(vepc); + + return 0; +} + +static int sgx_vepc_open(struct inode *inode, struct file *file) +{ + struct sgx_vepc *vepc; + + vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL); + if (!vepc) + return -ENOMEM; + mutex_init(&vepc->lock); + xa_init(&vepc->page_array); + + file->private_data = vepc; + + return 0; +} + +static const struct file_operations sgx_vepc_fops = { + .owner = THIS_MODULE, + .open = sgx_vepc_open, + .release = sgx_vepc_release, + .mmap = sgx_vepc_mmap, +}; + +static struct miscdevice sgx_vepc_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_vepc", + .nodename = "sgx_vepc", + .fops = &sgx_vepc_fops, +}; + +int __init sgx_vepc_init(void) +{ + /* SGX virtualization requires KVM to work */ + if (!cpu_feature_enabled(X86_FEATURE_VMX)) + return -ENODEV; + + INIT_LIST_HEAD(&zombie_secs_pages); + mutex_init(&zombie_secs_pages_lock); + + return misc_register(&sgx_vepc_dev); +} diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 840e12583b85..9c3bb777ff21 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -71,6 +71,18 @@ config KVM_INTEL To compile this as a module, choose M here: the module will be called kvm-intel. +config X86_SGX_KVM + bool "Software Guard eXtensions (SGX) Virtualization" + depends on X86_SGX && KVM_INTEL + help + + Enables KVM guests to create SGX enclaves. + + This includes support to expose "raw" unreclaimable enclave memory to + guests via a device node, e.g. /dev/sgx_vepc. + + If unsure, say N. + config KVM_AMD tristate "KVM for AMD processors support" depends on KVM -- Gitee From ae068fcddd8d90d83bfcde39f2c660f2a1914eb4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Mar 2021 20:22:58 +1300 Subject: [PATCH 035/134] x86/cpu/intel: Allow SGX virtualization without Launch Control support commit 332bfc7becf479de8a55864cc5ed0024baea28aa upstream. The kernel will currently disable all SGX support if the hardware does not support launch control. Make it more permissive to allow SGX virtualization on systems without Launch Control support. This will allow KVM to expose SGX to guests that have less-strict requirements on the availability of flexible launch control. Improve error message to distinguish between three cases. There are two cases where SGX support is completely disabled: 1) SGX has been disabled completely by the BIOS 2) SGX LC is locked by the BIOS. Bare-metal support is disabled because of LC unavailability. SGX virtualization is unavailable (because of Kconfig). One where it is partially available: 3) SGX LC is locked by the BIOS. Bare-metal support is disabled because of LC unavailability. SGX virtualization is supported. Intel-SIG: commit 332bfc7becf4 x86/cpu/intel: Allow SGX virtualization without Launch Control support. Backport for SGX virtualization support. Signed-off-by: Sean Christopherson Co-developed-by: Kai Huang Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Jarkko Sakkinen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/b3329777076509b3b601550da288c8f3c406a865.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du [ Zhiquan Li: amend commit log and resolve the conflict. 
] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/feat_ctl.c | 59 +++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index d9c736b0bdcf..c61b51de1f1c 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -16,7 +16,8 @@ early_param("nosgx", nosgx); void init_ia32_feat_ctl(struct cpuinfo_x86 *c) { - bool enable_sgx; + bool enable_sgx_kvm = false, enable_sgx_driver = false; + bool enable_vmx; u64 msr; if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) { @@ -24,13 +25,19 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) return; } - /* - * Enable SGX if and only if the kernel supports SGX and Launch Control - * is supported, i.e. disable SGX if the LE hash MSRs can't be written. - */ - enable_sgx = cpu_has(c, X86_FEATURE_SGX) && - cpu_has(c, X86_FEATURE_SGX_LC) && - IS_ENABLED(CONFIG_X86_SGX); + enable_vmx = cpu_has(c, X86_FEATURE_VMX) && + IS_ENABLED(CONFIG_KVM_INTEL); + + if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) { + /* + * Separate out SGX driver enabling from KVM. This allows KVM + * guests to use SGX even if the kernel SGX driver refuses to + * use it. This happens if flexible Launch Control is not + * available. + */ + enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC); + enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM); + } if (msr & FEAT_CTL_LOCKED) goto update_sgx; @@ -46,23 +53,45 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector * for the kernel, e.g. using VMX to hide malicious code. */ - if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) { + if (enable_vmx) { msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; if (tboot_enabled()) msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX; } - if (enable_sgx) - msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED; + if (enable_sgx_kvm || enable_sgx_driver) { + msr |= FEAT_CTL_SGX_ENABLED; + if (enable_sgx_driver) + msr |= FEAT_CTL_SGX_LC_ENABLED; + } wrmsrl(MSR_IA32_FEAT_CTL, msr); update_sgx: - if (!(msr & FEAT_CTL_SGX_ENABLED) || - !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) { - if (enable_sgx) - pr_err_once("SGX disabled by BIOS\n"); + if (!(msr & FEAT_CTL_SGX_ENABLED)) { + if (enable_sgx_kvm || enable_sgx_driver) + pr_err_once("SGX disabled by BIOS.\n"); clear_cpu_cap(c, X86_FEATURE_SGX); + return; + } + + /* + * VMX feature bit may be cleared due to being disabled in BIOS, + * in which case SGX virtualization cannot be supported either. + */ + if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) { + pr_err_once("SGX virtualization disabled due to lack of VMX.\n"); + enable_sgx_kvm = 0; + } + + if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) { + if (!enable_sgx_kvm) { + pr_err_once("SGX Launch Control is locked. Disable SGX.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX); + } else { + pr_err_once("SGX Launch Control is locked. Support SGX virtualization only.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX_LC); + } } } -- Gitee From 91ac9a23e3a24bce8f0fb304d7ea325614f96a39 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 19 Mar 2021 20:23:02 +1300 Subject: [PATCH 036/134] x86/sgx: Initialize virtual EPC driver even when SGX driver is disabled commit faa7d3e6f3b983a28bf0f88f82dcb1c162e61105 upstream. Modify sgx_init() to always try to initialize the virtual EPC driver, even if the SGX driver is disabled. 
The SGX driver might be disabled if SGX Launch Control is in locked mode, or not supported in the hardware at all. This allows (non-Linux) guests that support non-LC configurations to use SGX. [ bp: De-silli-fy the test. ] Intel-SIG: commit faa7d3e6f3b9 x86/sgx: Initialize virtual EPC driver even when SGX driver is disabled. Backport for SGX virtualization support. Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Acked-by: Jarkko Sakkinen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/d35d17a02bbf8feef83a536cec8b43746d4ea557.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/main.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 6e8e72af936a..7a99549cba1b 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -743,8 +743,17 @@ static int __init sgx_init(void) goto err_page_cache; } + /* + * Always try to initialize the native *and* KVM drivers. + * The KVM driver is less picky than the native one and + * can function if the native one is not supported on the + * current system or fails to initialize. + * + * Error out only if both fail to initialize. + */ ret = sgx_drv_init(); - if (ret) + + if (sgx_vepc_init() && ret) goto err_kthread; return 0; -- Gitee From a66dedaacf33608d8703babe16f107c0b632c47b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Mar 2021 20:23:04 +1300 Subject: [PATCH 037/134] x86/sgx: Move ENCLS leaf definitions to sgx.h commit 9c55c78a73ce6e62a1d46ba6e4f242c23c29b812 upstream. Move the ENCLS leaf definitions to sgx.h so that they can be used by KVM. Intel-SIG: commit 9c55c78a73ce x86/sgx: Move ENCLS leaf definitions to sgx.h. Backport for SGX virtualization support. Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Jarkko Sakkinen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/2e6cd7c5c1ced620cfcd292c3c6c382827fde6b2.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/include/asm/sgx.h | 15 +++++++++++++++ arch/x86/kernel/cpu/sgx/encls.h | 15 --------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index 14bb5f7e221c..34f44238d1d1 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -27,6 +27,21 @@ /* The bitmask for the EPC section type. 
*/ #define SGX_CPUID_EPC_MASK GENMASK(3, 0) +enum sgx_encls_function { + ECREATE = 0x00, + EADD = 0x01, + EINIT = 0x02, + EREMOVE = 0x03, + EDGBRD = 0x04, + EDGBWR = 0x05, + EEXTEND = 0x06, + ELDU = 0x08, + EBLOCK = 0x09, + EPA = 0x0A, + EWB = 0x0B, + ETRACK = 0x0C, +}; + /** * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index 443188fe7e70..be5c49689980 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -11,21 +11,6 @@ #include #include "sgx.h" -enum sgx_encls_function { - ECREATE = 0x00, - EADD = 0x01, - EINIT = 0x02, - EREMOVE = 0x03, - EDGBRD = 0x04, - EDGBWR = 0x05, - EEXTEND = 0x06, - ELDU = 0x08, - EBLOCK = 0x09, - EPA = 0x0A, - EWB = 0x0B, - ETRACK = 0x0C, -}; - /** * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr * -- Gitee From 143376ccea720cac3d7872d190d642fe5d8b2346 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Mar 2021 20:23:05 +1300 Subject: [PATCH 038/134] x86/sgx: Add SGX2 ENCLS leaf definitions (EAUG, EMODPR and EMODT) commit 32ddda8e445df3de477db14d386fb3518042224a upstream. Define the ENCLS leafs that are available with SGX2, also referred to as Enclave Dynamic Memory Management (EDMM). The leafs will be used by KVM to conditionally expose SGX2 capabilities to guests. Intel-SIG: commit 32ddda8e445d x86/sgx: Add SGX2 ENCLS leaf definitions (EAUG, EMODPR and EMODT). Backport for SGX virtualization support. Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Jarkko Sakkinen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/5f0970c251ebcc6d5add132f0d750cc753b7060f.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/include/asm/sgx.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index 34f44238d1d1..3b025afec0a7 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -40,6 +40,9 @@ enum sgx_encls_function { EPA = 0x0A, EWB = 0x0B, ETRACK = 0x0C, + EAUG = 0x0D, + EMODPR = 0x0E, + EMODT = 0x0F, }; /** -- Gitee From 4e2616867724fdb8088725576d6d28e411d3a433 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Mar 2021 20:23:06 +1300 Subject: [PATCH 039/134] x86/sgx: Add encls_faulted() helper commit a67136b458e5e63822b19c35794451122fe2bf3e upstream. Add a helper to extract the fault indicator from an encoded ENCLS return value. SGX virtualization will also need to detect ENCLS faults. Intel-SIG: commit a67136b458e5 x86/sgx: Add encls_faulted() helper. Backport for SGX virtualization support. 
Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Jarkko Sakkinen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/c1f955898110de2f669da536fc6cf62e003dff88.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encls.h | 15 ++++++++++++++- arch/x86/kernel/cpu/sgx/ioctl.c | 2 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index be5c49689980..9b204843b78d 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -40,6 +40,19 @@ } while (0); \ } +/* + * encls_faulted() - Check if an ENCLS leaf faulted given an error code + * @ret: the return value of an ENCLS leaf function call + * + * Return: + * - true: ENCLS leaf faulted. + * - false: Otherwise. + */ +static inline bool encls_faulted(int ret) +{ + return ret & ENCLS_FAULT_FLAG; +} + /** * encls_failed() - Check if an ENCLS function failed * @ret: the return value of an ENCLS function call @@ -50,7 +63,7 @@ */ static inline bool encls_failed(int ret) { - if (ret & ENCLS_FAULT_FLAG) + if (encls_faulted(ret)) return ENCLS_TRAPNR(ret) != X86_TRAP_PF; return !!ret; diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 1752d7c4f4f2..549c0f1ba0b7 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -568,7 +568,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, } } - if (ret & ENCLS_FAULT_FLAG) { + if (encls_faulted(ret)) { if (encls_failed(ret)) ENCLS_WARN(ret, "EINIT"); -- Gitee From 5eebd620e723e9b07f9a2830c2de8e655b51f055 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 19 Mar 2021 20:23:07 +1300 Subject: [PATCH 040/134] x86/sgx: Add helper to update SGX_LEPUBKEYHASHn MSRs commit 73916b6a0c714258f9c2619408a66c6696a761a7 upstream. Add a helper to update SGX_LEPUBKEYHASHn MSRs. SGX virtualization also needs to update those MSRs based on guest's "virtual" SGX_LEPUBKEYHASHn before EINIT from guest. Intel-SIG: commit 73916b6a0c71 x86/sgx: Add helper to update SGX_LEPUBKEYHASHn MSRs. Backport for SGX virtualization support. 
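The calling pattern this enables, condensed from the host driver path in the diff below (the guest-hash source on the KVM side is an assumption for illustration; KVM's actual plumbing lands in a later patch):

	u64 hash[4];	/* signer hash, or under KVM the guest's virtual
			 * IA32_SGXLEPUBKEYHASH{0-3} values (assumed source) */

	preempt_disable();		/* the hash MSRs are per-CPU state */
	sgx_update_lepubkeyhash(hash);	/* four wrmsrl()s */
	ret = __einit(sigstruct, token, addr);
	preempt_enable();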
Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/dfb7cd39d4dd62ea27703b64afdd8bccb579f623.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/ioctl.c | 5 ++--- arch/x86/kernel/cpu/sgx/main.c | 16 ++++++++++++++++ arch/x86/kernel/cpu/sgx/sgx.h | 2 ++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 549c0f1ba0b7..c86232ce8063 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -495,7 +495,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, void *token) { u64 mrsigner[4]; - int i, j, k; + int i, j; void *addr; int ret; @@ -544,8 +544,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, preempt_disable(); - for (k = 0; k < 4; k++) - wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]); + sgx_update_lepubkeyhash(mrsigner); ret = __einit(sigstruct, token, addr); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 7a99549cba1b..a6dad7c3099f 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -727,6 +727,22 @@ static bool __init sgx_page_cache_init(void) return true; } +/* + * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller. + * Bare-metal driver requires to update them to hash of enclave's signer + * before EINIT. KVM needs to update them to guest's virtual MSR values + * before doing EINIT from guest. + */ +void sgx_update_lepubkeyhash(u64 *lepubkeyhash) +{ + int i; + + WARN_ON_ONCE(preemptible()); + + for (i = 0; i < 4; i++) + wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); +} + static int __init sgx_init(void) { int ret; diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index e4cbc71bf136..4628acec0009 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -93,4 +93,6 @@ static inline int __init sgx_vepc_init(void) } #endif +void sgx_update_lepubkeyhash(u64 *lepubkeyhash); + #endif /* _X86_SGX_H */ -- Gitee From 75f3c935df55a19a4ade3250244429d1cc2e94f7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Mar 2021 20:23:08 +1300 Subject: [PATCH 041/134] x86/sgx: Add helpers to expose ECREATE and EINIT to KVM commit d155030b1e7c0e448aab22a803f7a71ea2e117d7 upstream. The host kernel must intercept ECREATE to impose policies on guests, and intercept EINIT to be able to write guest's virtual SGX_LEPUBKEYHASH MSR values to hardware before running guest's EINIT so it can run correctly according to hardware behavior. Provide wrappers around __ecreate() and __einit() to hide the ugliness of overloading the ENCLS return value to encode multiple error formats in a single int. KVM will trap-and-execute ECREATE and EINIT as part of SGX virtualization, and reflect ENCLS execution result to guest by setting up guest's GPRs, or on an exception, injecting the correct fault based on return value of __ecreate() and __einit(). Use host userspace addresses (provided by KVM based on guest physical address of ENCLS parameters) to execute ENCLS/EINIT when possible. Accesses to both EPC and memory originating from ENCLS are subject to segmentation and paging mechanisms. It's also possible to generate kernel mappings for ENCLS parameters by resolving PFN but using __uaccess_xx() is simpler. 
[ bp: Return early if the __user memory accesses fail, use cpu_feature_enabled(). ] Intel-SIG: commit d155030b1e7c x86/sgx: Add helpers to expose ECREATE and EINIT to KVM. Backport for SGX virtualization support. Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/20e09daf559aa5e9e680a0b4b5fba940f1bad86e.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/include/asm/sgx.h | 7 ++ arch/x86/kernel/cpu/sgx/virt.c | 117 +++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+) diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index 3b025afec0a7..954042e04102 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -365,4 +365,11 @@ struct sgx_sigstruct { * comment! */ +#ifdef CONFIG_X86_SGX_KVM +int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, + int *trapnr); +int sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs, u64 *lepubkeyhash, int *trapnr); +#endif + #endif /* _ASM_X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index 259cc46ad78c..7d221eac716a 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -257,3 +257,120 @@ int __init sgx_vepc_init(void) return misc_register(&sgx_vepc_dev); } + +/** + * sgx_virt_ecreate() - Run ECREATE on behalf of guest + * @pageinfo: Pointer to PAGEINFO structure + * @secs: Userspace pointer to SECS page + * @trapnr: trap number injected to guest in case of ECREATE error + * + * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose + * of enforcing policies of guest's enclaves, and return the trap number + * which should be injected to guest in case of any ECREATE error. + * + * Return: + * - 0: ECREATE was successful. + * - <0: on error. + */ +int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, + int *trapnr) +{ + int ret; + + /* + * @secs is an untrusted, userspace-provided address. It comes from + * KVM and is assumed to be a valid pointer which points somewhere in + * userspace. This can fault and call SGX or other fault handlers when + * userspace mapping @secs doesn't exist. + * + * Add a WARN() to make sure @secs is already valid userspace pointer + * from caller (KVM), who should already have handled invalid pointer + * case (for instance, made by malicious guest). All other checks, + * such as alignment of @secs, are deferred to ENCLS itself. + */ + if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE))) + return -EINVAL; + + __uaccess_begin(); + ret = __ecreate(pageinfo, (void *)secs); + __uaccess_end(); + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return -EFAULT; + } + + /* ECREATE doesn't return an error code, it faults or succeeds. */ + WARN_ON_ONCE(ret); + return 0; +} +EXPORT_SYMBOL_GPL(sgx_virt_ecreate); + +static int __sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs) +{ + int ret; + + /* + * Make sure all userspace pointers from caller (KVM) are valid. + * All other checks deferred to ENCLS itself. Also see comment + * for @secs in sgx_virt_ecreate(). 
+ */ +#define SGX_EINITTOKEN_SIZE 304 + if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) || + !access_ok(token, SGX_EINITTOKEN_SIZE) || + !access_ok(secs, PAGE_SIZE))) + return -EINVAL; + + __uaccess_begin(); + ret = __einit((void *)sigstruct, (void *)token, (void *)secs); + __uaccess_end(); + + return ret; +} + +/** + * sgx_virt_einit() - Run EINIT on behalf of guest + * @sigstruct: Userspace pointer to SIGSTRUCT structure + * @token: Userspace pointer to EINITTOKEN structure + * @secs: Userspace pointer to SECS page + * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values + * @trapnr: trap number injected to guest in case of EINIT error + * + * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available + * in host, SGX driver may rewrite the hardware values at wish, therefore KVM + * needs to update hardware values to guest's virtual MSR values in order to + * ensure EINIT is executed with expected hardware values. + * + * Return: + * - 0: EINIT was successful. + * - <0: on error. + */ +int sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs, u64 *lepubkeyhash, int *trapnr) +{ + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) { + ret = __sgx_virt_einit(sigstruct, token, secs); + } else { + preempt_disable(); + + sgx_update_lepubkeyhash(lepubkeyhash); + + ret = __sgx_virt_einit(sigstruct, token, secs); + preempt_enable(); + } + + /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */ + if (ret == -EINVAL) + return ret; + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return -EFAULT; + } + + return ret; +} +EXPORT_SYMBOL_GPL(sgx_virt_einit); -- Gitee From 2e9035ec4fe5cbd125aa984b876523beef28ca78 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Mar 2021 20:23:09 +1300 Subject: [PATCH 042/134] x86/sgx: Move provisioning device creation out of SGX driver commit b3754e5d3da320af2bebb7a690002685c7f5c15c upstream. And extract sgx_set_attribute() out of sgx_ioc_enclave_provision() and export it as symbol for KVM to use. The provisioning key is sensitive. The SGX driver only allows to create an enclave which can access the provisioning key when the enclave creator has permission to open /dev/sgx_provision. It should apply to a VM as well, as the provisioning key is platform-specific, thus an unrestricted VM can also potentially compromise the provisioning key. Move the provisioning device creation out of sgx_drv_init() to sgx_init() as a preparation for adding SGX virtualization support, so that even if the SGX driver is not enabled due to flexible launch control not being available, SGX virtualization can still be enabled, and use it to restrict a VM's capability of being able to access the provisioning key. [ bp: Massage commit message. ] Intel-SIG: commit b3754e5d3da3 x86/sgx: Move provisioning device creation out of SGX driver. Backport for SGX virtualization support. 
Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Reviewed-by: Jarkko Sakkinen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/0f4d044d621561f26d5f4ef73e8dc6cd18cc7e79.1616136308.git.kai.huang@intel.com Signed-off-by: Fan Du [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/include/asm/sgx.h | 3 ++ arch/x86/kernel/cpu/sgx/driver.c | 17 ---------- arch/x86/kernel/cpu/sgx/ioctl.c | 16 ++------- arch/x86/kernel/cpu/sgx/main.c | 57 +++++++++++++++++++++++++++++++- 4 files changed, 61 insertions(+), 32 deletions(-) diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index 954042e04102..a16e2c9154a3 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -372,4 +372,7 @@ int sgx_virt_einit(void __user *sigstruct, void __user *token, void __user *secs, u64 *lepubkeyhash, int *trapnr); #endif +int sgx_set_attribute(unsigned long *allowed_attributes, + unsigned int attribute_fd); + #endif /* _ASM_X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index 8ce6d8371cfb..aa9b8b868867 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -136,10 +136,6 @@ static const struct file_operations sgx_encl_fops = { .get_unmapped_area = sgx_get_unmapped_area, }; -const struct file_operations sgx_provision_fops = { - .owner = THIS_MODULE, -}; - static struct miscdevice sgx_dev_enclave = { .minor = MISC_DYNAMIC_MINOR, .name = "sgx_enclave", @@ -147,13 +143,6 @@ static struct miscdevice sgx_dev_enclave = { .fops = &sgx_encl_fops, }; -static struct miscdevice sgx_dev_provision = { - .minor = MISC_DYNAMIC_MINOR, - .name = "sgx_provision", - .nodename = "sgx_provision", - .fops = &sgx_provision_fops, -}; - int __init sgx_drv_init(void) { unsigned int eax, ebx, ecx, edx; @@ -187,11 +176,5 @@ int __init sgx_drv_init(void) if (ret) return ret; - ret = misc_register(&sgx_dev_provision); - if (ret) { - misc_deregister(&sgx_dev_enclave); - return ret; - } - return 0; } diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index c86232ce8063..ba716f819bd4 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -2,6 +2,7 @@ /* Copyright(c) 2016-20 Intel Corporation. */ #include +#include #include #include #include @@ -666,24 +667,11 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg) { struct sgx_enclave_provision params; - struct file *file; if (copy_from_user(¶ms, arg, sizeof(params))) return -EFAULT; - file = fget(params.fd); - if (!file) - return -EINVAL; - - if (file->f_op != &sgx_provision_fops) { - fput(file); - return -EINVAL; - } - - encl->attributes_mask |= SGX_ATTR_PROVISIONKEY; - - fput(file); - return 0; + return sgx_set_attribute(&encl->attributes_mask, params.fd); } long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index a6dad7c3099f..7d0aac46e25e 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -1,14 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2016-20 Intel Corporation. 
*/ +#include #include #include #include +#include #include #include #include #include #include +#include #include "driver.h" #include "encl.h" #include "encls.h" @@ -743,6 +746,51 @@ void sgx_update_lepubkeyhash(u64 *lepubkeyhash) wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); } +const struct file_operations sgx_provision_fops = { + .owner = THIS_MODULE, +}; + +static struct miscdevice sgx_dev_provision = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_provision", + .nodename = "sgx_provision", + .fops = &sgx_provision_fops, +}; + +/** + * sgx_set_attribute() - Update allowed attributes given file descriptor + * @allowed_attributes: Pointer to allowed enclave attributes + * @attribute_fd: File descriptor for specific attribute + * + * Append enclave attribute indicated by file descriptor to allowed + * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by + * /dev/sgx_provision is supported. + * + * Return: + * -0: SGX_ATTR_PROVISIONKEY is appended to allowed_attributes + * -EINVAL: Invalid, or not supported file descriptor + */ +int sgx_set_attribute(unsigned long *allowed_attributes, + unsigned int attribute_fd) +{ + struct file *file; + + file = fget(attribute_fd); + if (!file) + return -EINVAL; + + if (file->f_op != &sgx_provision_fops) { + fput(file); + return -EINVAL; + } + + *allowed_attributes |= SGX_ATTR_PROVISIONKEY; + + fput(file); + return 0; +} +EXPORT_SYMBOL_GPL(sgx_set_attribute); + static int __init sgx_init(void) { int ret; @@ -759,6 +807,10 @@ static int __init sgx_init(void) goto err_page_cache; } + ret = misc_register(&sgx_dev_provision); + if (ret) + goto err_kthread; + /* * Always try to initialize the native *and* KVM drivers. * The KVM driver is less picky than the native one and @@ -770,10 +822,13 @@ static int __init sgx_init(void) ret = sgx_drv_init(); if (sgx_vepc_init() && ret) - goto err_kthread; + goto err_provision; return 0; +err_provision: + misc_deregister(&sgx_dev_provision); + err_kthread: kthread_stop(ksgxd_tsk); -- Gitee From cfc465c7f7b9eaff9cc81b912531600e526d8320 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:33 +1200 Subject: [PATCH 043/134] KVM: x86: Export kvm_mmu_gva_to_gpa_{read,write}() for SGX (VMX) commit 54f958cdaa8c43c0e9b9ef850ae613a6e1bda44e upstream. Export the gva_to_gpa() helpers for use by SGX virtualization when executing ENCLS[ECREATE] and ENCLS[EINIT] on behalf of the guest. To execute ECREATE and EINIT, KVM must obtain the GPA of the target Secure Enclave Control Structure (SECS) in order to get its corresponding HVA. Because the SECS must reside in the Enclave Page Cache (EPC), copying the SECS's data to a host-controlled buffer via existing exported helpers is not a viable option as the EPC is not readable or writable by the kernel. SGX virtualization will also use gva_to_gpa() to obtain HVAs for non-EPC pages in order to pass user pointers directly to ECREATE and EINIT, which avoids having to copy pages worth of data into the kernel. Intel-SIG: commit 54f958cdaa8c KVM: x86: Export kvm_mmu_gva_to_gpa_{read,write}() for SGX (VMX). Backport for SGX virtualization support Signed-off-by: Sean Christopherson Acked-by: Jarkko Sakkinen Signed-off-by: Kai Huang Message-Id: <02f37708321bcdfaa2f9d41c8478affa6e84b04d.1618196135.git.kai.huang@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Fan Du [ Zhiquan Li: amend commit log and resolve the conflict. 
] Signed-off-by: Zhiquan Li --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b3e051858689..a7a362755c29 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5583,6 +5583,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } +EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) @@ -5599,6 +5600,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, access |= PFERR_WRITE_MASK; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } +EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write); /* uses this to access any guest's mapped memory without checking CPL */ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, -- Gitee From e1a13cd8f4ea387ce62ad5bc76e4ee1c0dd1848e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:34 +1200 Subject: [PATCH 044/134] KVM: x86: Define new #PF SGX error code bit commit 00e7646c3563d2f1a46a8fa1824c32373d77a8be upstream. Page faults that are signaled by the SGX Enclave Page Cache Map (EPCM), as opposed to the traditional IA32/EPT page tables, set an SGX bit in the error code to indicate that the #PF was induced by SGX. KVM will need to emulate this behavior as part of its trap-and-execute scheme for virtualizing SGX Launch Control, e.g. to inject SGX-induced #PFs if EINIT faults in the host, and to support live migration. Intel-SIG: commit 00e7646c3563 KVM: x86: Define new #PF SGX error code bit. Backport for SGX virtualization support. Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Message-Id: Signed-off-by: Paolo Bonzini Signed-off-by: Fan Du [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/include/asm/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c3948b936db7..9294927e7257 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -220,6 +220,7 @@ enum exit_fastpath_completion { #define PFERR_RSVD_BIT 3 #define PFERR_FETCH_BIT 4 #define PFERR_PK_BIT 5 +#define PFERR_SGX_BIT 15 #define PFERR_GUEST_FINAL_BIT 32 #define PFERR_GUEST_PAGE_BIT 33 @@ -229,6 +230,7 @@ enum exit_fastpath_completion { #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) #define PFERR_PK_MASK (1U << PFERR_PK_BIT) +#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT) #define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT) #define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT) -- Gitee From 1bb72cb004ee247dcc1b5a54707e0bef2e2af293 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:36 +1200 Subject: [PATCH 045/134] KVM: x86: Add reverse-CPUID lookup support for scattered SGX features commit 01de8682b32d3ed4f0136f7379e1e3ae2e563308 upstream. Define a new KVM-only feature word for advertising and querying SGX sub-features in CPUID.0x12.0x0.EAX. Because SGX1 and SGX2 are scattered in the kernel's feature word, they need to be translated so that the bit numbers match those of hardware. Intel-SIG: commit 01de8682b32d KVM: x86: Add reverse-CPUID lookup support for scattered SGX features. Backport for SGX virtualization support. 
Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Message-Id: Signed-off-by: Paolo Bonzini Signed-off-by: Fan Du [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kvm/cpuid.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index d2bd1b4e5d51..2281280a4259 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -12,12 +12,17 @@ * "bug" caps, but KVM doesn't use those. */ enum kvm_only_cpuid_leafs { - NR_KVM_CPU_CAPS = NCAPINTS, + CPUID_12_EAX = NCAPINTS, + NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, }; -#define X86_KVM_FEATURE(w, f) ((w)*32 + (f)) +#define KVM_X86_FEATURE(w, f) ((w)*32 + (f)) + +/* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */ +#define KVM_X86_FEATURE_SGX1 KVM_X86_FEATURE(CPUID_12_EAX, 0) +#define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1) extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; void kvm_set_cpu_caps(void); @@ -72,6 +77,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX}, [CPUID_7_EDX] = { 7, 0, CPUID_EDX}, [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX}, + [CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX}, }; /* @@ -98,6 +104,11 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf) */ static __always_inline u32 __feature_translate(int x86_feature) { + if (x86_feature == X86_FEATURE_SGX1) + return KVM_X86_FEATURE_SGX1; + else if (x86_feature == X86_FEATURE_SGX2) + return KVM_X86_FEATURE_SGX2; + return x86_feature; } -- Gitee From abdf6a25249dea11c540a544f361ed61edfb27f5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:37 +1200 Subject: [PATCH 046/134] KVM: VMX: Add basic handling of VM-Exit from SGX enclave commit 3c0c2ad1ae75963c05bf89ec91918c6a53a72696 upstream. Add support for handling VM-Exits that originate from a guest SGX enclave. In SGX, an "enclave" is a new CPL3-only execution environment, wherein the CPU and memory state is protected by hardware to make the state inaccesible to code running outside of the enclave. When exiting an enclave due to an asynchronous event (from the perspective of the enclave), e.g. exceptions, interrupts, and VM-Exits, the enclave's state is automatically saved and scrubbed (the CPU loads synthetic state), and then reloaded when re-entering the enclave. E.g. after an instruction based VM-Exit from an enclave, vmcs.GUEST_RIP will not contain the RIP of the enclave instruction that trigered VM-Exit, but will instead point to a RIP in the enclave's untrusted runtime (the guest userspace code that coordinates entry/exit to/from the enclave). To help a VMM recognize and handle exits from enclaves, SGX adds bits to existing VMCS fields, VM_EXIT_REASON.VMX_EXIT_REASON_FROM_ENCLAVE and GUEST_INTERRUPTIBILITY_INFO.GUEST_INTR_STATE_ENCLAVE_INTR. Define the new architectural bits, and add a boolean to struct vcpu_vmx to cache VMX_EXIT_REASON_FROM_ENCLAVE. Clear the bit in exit_reason so that checks against exit_reason do not need to account for SGX, e.g. "if (exit_reason == EXIT_REASON_EXCEPTION_NMI)" continues to work. KVM is a largely a passive observer of the new bits, e.g. KVM needs to account for the bits when propagating information to a nested VMM, but otherwise doesn't need to act differently for the majority of VM-Exits from enclaves. 
The one scenario that is directly impacted is emulation, which is for all intents and purposes impossible[1] since KVM does not have access to the RIP or instruction stream that triggered the VM-Exit. The inability to emulate is a non-issue for KVM, as most instructions that might trigger VM-Exit unconditionally #UD in an enclave (before the VM-Exit check. For the few instruction that conditionally #UD, KVM either never sets the exiting control, e.g. PAUSE_EXITING[2], or sets it if and only if the feature is not exposed to the guest in order to inject a #UD, e.g. RDRAND_EXITING. But, because it is still possible for a guest to trigger emulation, e.g. MMIO, inject a #UD if KVM ever attempts emulation after a VM-Exit from an enclave. This is architecturally accurate for instruction VM-Exits, and for MMIO it's the least bad choice, e.g. it's preferable to killing the VM. In practice, only broken or particularly stupid guests should ever encounter this behavior. Add a WARN in skip_emulated_instruction to detect any attempt to modify the guest's RIP during an SGX enclave VM-Exit as all such flows should either be unreachable or must handle exits from enclaves before getting to skip_emulated_instruction. [1] Impossible for all practical purposes. Not truly impossible since KVM could implement some form of para-virtualization scheme. [2] PAUSE_LOOP_EXITING only affects CPL0 and enclaves exist only at CPL3, so we also don't need to worry about that interaction. Intel-SIG: commit 3c0c2ad1ae75 KVM: VMX: Add basic handling of VM-Exit from SGX enclave Backport for SGX virtualization support. Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Message-Id: <315f54a8507d09c292463ef29104e1d4c62e9090.1618196135.git.kai.huang@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Fan Du [ Zhiquan Li: amend commit log and resolve the conflict. 
] Signed-off-by: Zhiquan Li --- arch/x86/include/asm/vmx.h | 1 + arch/x86/include/uapi/asm/vmx.h | 1 + arch/x86/kvm/vmx/nested.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 45 +++++++++++++++++++++++++++++++-- 4 files changed, 47 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 083910dc5011..2b26d6ede9d6 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -382,6 +382,7 @@ enum vmcs_field { #define GUEST_INTR_STATE_MOV_SS 0x00000002 #define GUEST_INTR_STATE_SMI 0x00000004 #define GUEST_INTR_STATE_NMI 0x00000008 +#define GUEST_INTR_STATE_ENCLAVE_INTR 0x00000010 /* GUEST_ACTIVITY_STATE flags */ #define GUEST_ACTIVITY_ACTIVE 0 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 66d8f64d75a9..5c115b8e950a 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -27,6 +27,7 @@ #define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 +#define VMX_EXIT_REASONS_SGX_ENCLAVE_MODE 0x08000000 #define EXIT_REASON_EXCEPTION_NMI 0 #define EXIT_REASON_EXTERNAL_INTERRUPT 1 diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e9ca618c8ed4..bffc951be851 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3749,6 +3749,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, { /* update exit information fields: */ vmcs12->vm_exit_reason = exit_reason; + if (to_vmx(vcpu)->exit_reason.enclave_mode) + vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; vmcs12->exit_qualification = exit_qualification; vmcs12->vm_exit_intr_info = exit_intr_info; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6c79798d2da2..e6f04d3716f7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1572,12 +1572,25 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) { + /* + * Emulation of instructions in SGX enclaves is impossible as RIP does + * not point tthe failing instruction, and even if it did, the code + * stream is inaccessible. Inject #UD instead of exiting to userspace + * so that guest userspace can't DoS the guest simply by triggering + * emulation (enclaves are CPL3 only). + */ + if (to_vmx(vcpu)->exit_reason.enclave_mode) { + kvm_queue_exception(vcpu, UD_VECTOR); + return false; + } return true; } static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { + union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason; unsigned long rip, orig_rip; + u32 instr_len; /* * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on @@ -1588,9 +1601,33 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) * i.e. we end up advancing IP with some random value. */ if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || - to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { + exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { + instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + + /* + * Emulating an enclave's instructions isn't supported as KVM + * cannot access the enclave's memory or its true RIP, e.g. the + * vmcs.GUEST_RIP points at the exit point of the enclave, not + * the RIP that actually triggered the VM-Exit. But, because + * most instructions that cause VM-Exit will #UD in an enclave, + * most instruction-based VM-Exits simply do not occur. 
+ * + * There are a few exceptions, notably the debug instructions + * INT1ICEBRK and INT3, as they are allowed in debug enclaves + * and generate #DB/#BP as expected, which KVM might intercept. + * But again, the CPU does the dirty work and saves an instr + * length of zero so VMMs don't shoot themselves in the foot. + * WARN if KVM tries to skip a non-zero length instruction on + * a VM-Exit from an enclave. + */ + if (!instr_len) + goto rip_updated; + + WARN(exit_reason.enclave_mode, + "KVM: skipping instruction after SGX enclave VM-Exit"); + orig_rip = kvm_rip_read(vcpu); - rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + rip = orig_rip + instr_len; #ifdef CONFIG_X86_64 /* * We need to mask out the high 32 bits of RIP if not in 64-bit @@ -1606,6 +1643,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) return 0; } +rip_updated: /* skipping an emulated instruction also counts */ vmx_set_interrupt_shadow(vcpu, 0); @@ -5378,6 +5416,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { gpa_t gpa; + if (!vmx_can_emulate_instruction(vcpu, NULL, 0)) + return 1; + /* * A nested guest cannot optimize MMIO vmexits, because we have an * nGPA here instead of the required GPA. -- Gitee From 0942a1108037c181510d6bb354b4e872d348f101 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:38 +1200 Subject: [PATCH 047/134] KVM: VMX: Frame in ENCLS handler for SGX virtualization commit 9798adbc04cf1b14325dc7e2c882639693516a69 upstream. Introduce sgx.c and sgx.h, along with the framework for handling ENCLS VM-Exits. Add a bool, enable_sgx, that will eventually be wired up to a module param to control whether or not SGX virtualization is enabled at runtime. Intel-SIG: commit 9798adbc04cf KVM: VMX: Frame in ENCLS handler for SGX virtualization. Backport for SGX virtualization support. Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Message-Id: <1c782269608b2f5e1034be450f375a8432fb705d.1618196135.git.kai.huang@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Fan Du [ Zhiquan Li: amend commit log and resolve the conflict. ] Signed-off-by: Zhiquan Li --- arch/x86/kvm/Makefile | 2 ++ arch/x86/kvm/vmx/sgx.c | 50 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/sgx.h | 15 +++++++++++++ arch/x86/kvm/vmx/vmx.c | 9 +++++--- 4 files changed, 73 insertions(+), 3 deletions(-) create mode 100644 arch/x86/kvm/vmx/sgx.c create mode 100644 arch/x86/kvm/vmx/sgx.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 31ecf7a76d5a..401e0fcf39a6 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -13,6 +13,8 @@ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ hyperv.o page_track.o debugfs.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o +kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o + kvm-amd-y += svm.o pmu_amd.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c new file mode 100644 index 000000000000..d874eb180b7d --- /dev/null +++ b/arch/x86/kvm/vmx/sgx.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2021 Intel Corporation. 
*/ + +#include + +#include "cpuid.h" +#include "kvm_cache_regs.h" +#include "sgx.h" +#include "vmx.h" +#include "x86.h" + +bool __read_mostly enable_sgx; + +static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf) +{ + if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX)) + return false; + + if (leaf >= ECREATE && leaf <= ETRACK) + return guest_cpuid_has(vcpu, X86_FEATURE_SGX1); + + if (leaf >= EAUG && leaf <= EMODT) + return guest_cpuid_has(vcpu, X86_FEATURE_SGX2); + + return false; +} + +static inline bool sgx_enabled_in_guest_bios(struct kvm_vcpu *vcpu) +{ + const u64 bits = FEAT_CTL_SGX_ENABLED | FEAT_CTL_LOCKED; + + return (to_vmx(vcpu)->msr_ia32_feature_control & bits) == bits; +} + +int handle_encls(struct kvm_vcpu *vcpu) +{ + u32 leaf = (u32)kvm_rax_read(vcpu); + + if (!encls_leaf_enabled_in_guest(vcpu, leaf)) { + kvm_queue_exception(vcpu, UD_VECTOR); + } else if (!sgx_enabled_in_guest_bios(vcpu)) { + kvm_inject_gp(vcpu, 0); + } else { + WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf); + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS; + return 0; + } + return 1; +} diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h new file mode 100644 index 000000000000..6e17ecd4aca3 --- /dev/null +++ b/arch/x86/kvm/vmx/sgx.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_SGX_H +#define __KVM_X86_SGX_H + +#include + +#ifdef CONFIG_X86_SGX_KVM +extern bool __read_mostly enable_sgx; + +int handle_encls(struct kvm_vcpu *vcpu); +#else +#define enable_sgx 0 +#endif + +#endif /* __KVM_X86_SGX_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e6f04d3716f7..0d4a9d0dd997 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -59,6 +59,7 @@ #include "nested.h" #include "ops.h" #include "pmu.h" +#include "sgx.h" #include "trace.h" #include "vmcs.h" #include "vmcs12.h" @@ -5754,16 +5755,18 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu) return 1; } +#ifndef CONFIG_X86_SGX_KVM static int handle_encls(struct kvm_vcpu *vcpu) { /* - * SGX virtualization is not yet supported. There is no software - * enable bit for SGX, so we have to trap ENCLS and inject a #UD - * to prevent the guest from executing ENCLS. + * SGX virtualization is disabled. There is no software enable bit for + * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent + * the guest from executing ENCLS (when SGX is supported by hardware). */ kvm_queue_exception(vcpu, UD_VECTOR); return 1; } +#endif /* CONFIG_X86_SGX_KVM */ static int handle_enqcmd_pasid(struct kvm_vcpu *vcpu) { -- Gitee From 4bed1901668612a9812ebf53cbe03f6ff86a4a85 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:39 +1200 Subject: [PATCH 048/134] KVM: VMX: Add SGX ENCLS[ECREATE] handler to enforce CPUID restrictions commit 70210c044b4ea8f05e93ec62abc30cab4233ec88 upstream. Add an ECREATE handler that will be used to intercept ECREATE for the purpose of enforcing and enclave's MISCSELECT, ATTRIBUTES and XFRM, i.e. to allow userspace to restrict SGX features via CPUID. ECREATE will be intercepted when any of the aforementioned masks diverges from hardware in order to enforce the desired CPUID model, i.e. inject #GP if the guest attempts to set a bit that hasn't been enumerated as allowed-1 in CPUID. Note, access to the PROVISIONKEY is not yet supported. Intel-SIG: commit 70210c044b4e KVM: VMX: Add SGX ENCLS[ECREATE] handler to enforce CPUID restrictions. 
Backport for SGX virtualization support. Signed-off-by: Sean Christopherson Co-developed-by: Kai Huang Signed-off-by: Kai Huang Message-Id: Signed-off-by: Paolo Bonzini Signed-off-by: Fan Du [ Zhiquan Li: amend commit log and resolve the conflict. ] Signed-off-by: Zhiquan Li --- arch/x86/include/asm/kvm_host.h | 3 + arch/x86/kvm/vmx/sgx.c | 275 ++++++++++++++++++++++++++++++++ 2 files changed, 278 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9294927e7257..6c35b830c5fb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -958,6 +958,9 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + /* Guest can access the SGX PROVISIONKEY. */ + bool sgx_provisioning_allowed; + struct kvm_pmu_event_filter *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; /* diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index d874eb180b7d..2cf9d7cf818c 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -11,6 +11,279 @@ bool __read_mostly enable_sgx; +/* + * ENCLS's memory operands use a fixed segment (DS) and a fixed + * address size based on the mode. Related prefixes are ignored. + */ +static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset, + int size, int alignment, gva_t *gva) +{ + struct kvm_segment s; + bool fault; + + /* Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs. */ + *gva = offset; + if (!is_long_mode(vcpu)) { + vmx_get_segment(vcpu, &s, VCPU_SREG_DS); + *gva += s.base; + } + + if (!IS_ALIGNED(*gva, alignment)) { + fault = true; + } else if (likely(is_long_mode(vcpu))) { + fault = is_noncanonical_address(*gva, vcpu); + } else { + *gva &= 0xffffffff; + fault = (s.unusable) || + (s.type != 2 && s.type != 3) || + (*gva > s.limit) || + ((s.base != 0 || s.limit != 0xffffffff) && + (((u64)*gva + size - 1) > s.limit + 1)); + } + if (fault) + kvm_inject_gp(vcpu, 0); + return fault ? -EINVAL : 0; +} + +static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr, + unsigned int size) +{ + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = addr; + vcpu->run->internal.data[1] = size; +} + +static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data, + unsigned int size) +{ + if (__copy_from_user(data, (void __user *)hva, size)) { + sgx_handle_emulation_failure(vcpu, hva, size); + return -EFAULT; + } + + return 0; +} + +static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write, + gpa_t *gpa) +{ + struct x86_exception ex; + + if (write) + *gpa = kvm_mmu_gva_to_gpa_write(vcpu, gva, &ex); + else + *gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex); + + if (*gpa == UNMAPPED_GVA) { + kvm_inject_emulated_page_fault(vcpu, &ex); + return -EFAULT; + } + + return 0; +} + +static int sgx_gpa_to_hva(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long *hva) +{ + *hva = kvm_vcpu_gfn_to_hva(vcpu, PFN_DOWN(gpa)); + if (kvm_is_error_hva(*hva)) { + sgx_handle_emulation_failure(vcpu, gpa, 1); + return -EFAULT; + } + + *hva |= gpa & ~PAGE_MASK; + + return 0; +} + +static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr) +{ + struct x86_exception ex; + + /* + * A non-EPCM #PF indicates a bad userspace HVA. 
This *should* check + * for PFEC.SGX and not assume any #PF on SGX2 originated in the EPC, + * but the error code isn't (yet) plumbed through the ENCLS helpers. + */ + if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } + + /* + * If the guest thinks it's running on SGX2 hardware, inject an SGX + * #PF if the fault matches an EPCM fault signature (#GP on SGX1, + * #PF on SGX2). The assumption is that EPCM faults are much more + * likely than a bad userspace address. + */ + if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) && + guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) { + memset(&ex, 0, sizeof(ex)); + ex.vector = PF_VECTOR; + ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK | + PFERR_SGX_MASK; + ex.address = gva; + ex.error_code_valid = true; + ex.nested_page_fault = false; + kvm_inject_page_fault(vcpu, &ex); + } else { + kvm_inject_gp(vcpu, 0); + } + return 1; +} + +static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, + struct sgx_pageinfo *pageinfo, + unsigned long secs_hva, + gva_t secs_gva) +{ + struct sgx_secs *contents = (struct sgx_secs *)pageinfo->contents; + struct kvm_cpuid_entry2 *sgx_12_0, *sgx_12_1; + u64 attributes, xfrm, size; + u32 miscselect; + u8 max_size_log2; + int trapnr, ret; + + sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0); + sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1); + if (!sgx_12_0 || !sgx_12_1) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } + + miscselect = contents->miscselect; + attributes = contents->attributes; + xfrm = contents->xfrm; + size = contents->size; + + /* Enforce restriction of access to the PROVISIONKEY. */ + if (!vcpu->kvm->arch.sgx_provisioning_allowed && + (attributes & SGX_ATTR_PROVISIONKEY)) { + if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY) + pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n"); + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. */ + if ((u32)miscselect & ~sgx_12_0->ebx || + (u32)attributes & ~sgx_12_1->eax || + (u32)(attributes >> 32) & ~sgx_12_1->ebx || + (u32)xfrm & ~sgx_12_1->ecx || + (u32)(xfrm >> 32) & ~sgx_12_1->edx) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* Enforce CPUID restriction on max enclave size. */ + max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 : + sgx_12_0->edx; + if (size >= BIT_ULL(max_size_log2)) + kvm_inject_gp(vcpu, 0); + + /* + * sgx_virt_ecreate() returns: + * 1) 0: ECREATE was successful + * 2) -EFAULT: ECREATE was run but faulted, and trapnr was set to the + * exception number. + * 3) -EINVAL: access_ok() on @secs_hva failed. This should never + * happen as KVM checks host addresses at memslot creation. + * sgx_virt_ecreate() has already warned in this case. 
+ */ + ret = sgx_virt_ecreate(pageinfo, (void __user *)secs_hva, &trapnr); + if (!ret) + return kvm_skip_emulated_instruction(vcpu); + if (ret == -EFAULT) + return sgx_inject_fault(vcpu, secs_gva, trapnr); + + return ret; +} + +static int handle_encls_ecreate(struct kvm_vcpu *vcpu) +{ + gva_t pageinfo_gva, secs_gva; + gva_t metadata_gva, contents_gva; + gpa_t metadata_gpa, contents_gpa, secs_gpa; + unsigned long metadata_hva, contents_hva, secs_hva; + struct sgx_pageinfo pageinfo; + struct sgx_secs *contents; + struct x86_exception ex; + int r; + + if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 32, 32, &pageinfo_gva) || + sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva)) + return 1; + + /* + * Copy the PAGEINFO to local memory, its pointers need to be + * translated, i.e. we need to do a deep copy/translate. + */ + r = kvm_read_guest_virt(vcpu, pageinfo_gva, &pageinfo, + sizeof(pageinfo), &ex); + if (r == X86EMUL_PROPAGATE_FAULT) { + kvm_inject_emulated_page_fault(vcpu, &ex); + return 1; + } else if (r != X86EMUL_CONTINUE) { + sgx_handle_emulation_failure(vcpu, pageinfo_gva, + sizeof(pageinfo)); + return 0; + } + + if (sgx_get_encls_gva(vcpu, pageinfo.metadata, 64, 64, &metadata_gva) || + sgx_get_encls_gva(vcpu, pageinfo.contents, 4096, 4096, + &contents_gva)) + return 1; + + /* + * Translate the SECINFO, SOURCE and SECS pointers from GVA to GPA. + * Resume the guest on failure to inject a #PF. + */ + if (sgx_gva_to_gpa(vcpu, metadata_gva, false, &metadata_gpa) || + sgx_gva_to_gpa(vcpu, contents_gva, false, &contents_gpa) || + sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa)) + return 1; + + /* + * ...and then to HVA. The order of accesses isn't architectural, i.e. + * KVM doesn't have to fully process one address at a time. Exit to + * userspace if a GPA is invalid. + */ + if (sgx_gpa_to_hva(vcpu, metadata_gpa, &metadata_hva) || + sgx_gpa_to_hva(vcpu, contents_gpa, &contents_hva) || + sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva)) + return 0; + + /* + * Copy contents into kernel memory to prevent TOCTOU attack. E.g. the + * guest could do ECREATE w/ SECS.SGX_ATTR_PROVISIONKEY=0, and + * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to + * enforce restriction of access to the PROVISIONKEY. + */ + contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT); + if (!contents) + return -ENOMEM; + + /* Exit to userspace if copying from a host userspace address fails. */ + if (sgx_read_hva(vcpu, contents_hva, (void *)contents, PAGE_SIZE)) { + free_page((unsigned long)contents); + return 0; + } + + pageinfo.metadata = metadata_hva; + pageinfo.contents = (u64)contents; + + r = __handle_encls_ecreate(vcpu, &pageinfo, secs_hva, secs_gva); + + free_page((unsigned long)contents); + + return r; +} + static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf) { if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX)) @@ -41,6 +314,8 @@ int handle_encls(struct kvm_vcpu *vcpu) } else if (!sgx_enabled_in_guest_bios(vcpu)) { kvm_inject_gp(vcpu, 0); } else { + if (leaf == ECREATE) + return handle_encls_ecreate(vcpu); WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf); vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS; -- Gitee From e1013bc67c8e4218e8e6c00f37b0dc382cf44bd9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:40 +1200 Subject: [PATCH 049/134] KVM: VMX: Add emulation of SGX Launch Control LE hash MSRs commit 8f102445d4045384799627c53d82c45ca2cad3a5 upstream. 
Emulate the four Launch Enclave public key hash MSRs (LE hash MSRs) that exist on CPUs that support SGX Launch Control (LC). SGX LC modifies the behavior of ENCLS[EINIT] to use the LE hash MSRs when verifying the key used to sign an enclave. On CPUs without LC support, the LE hash is hardwired into the CPU to an Intel controlled key (the Intel key is also the reset value of the LE hash MSRs). Track the guest's desired hash so that a future patch can stuff the hash into the hardware MSRs when executing EINIT on behalf of the guest, when those MSRs are writable in host. Intel-SIG: commit 8f102445d404 KVM: VMX: Add emulation of SGX Launch Control LE hash MSRs. Backport for SGX virtualization support. Signed-off-by: Sean Christopherson Co-developed-by: Kai Huang Signed-off-by: Kai Huang Message-Id: [Add a comment regarding the MSRs being available until SGX is locked. - Paolo] Signed-off-by: Paolo Bonzini Signed-off-by: Fan Du [ Zhiquan Li: amend commit log and resolve the conflict. ] Signed-off-by: Zhiquan Li --- arch/x86/kvm/vmx/sgx.c | 35 +++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/sgx.h | 6 ++++++ arch/x86/kvm/vmx/vmx.c | 31 +++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.h | 3 +++ 4 files changed, 75 insertions(+) diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 2cf9d7cf818c..d53e6b17b320 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -11,6 +11,9 @@ bool __read_mostly enable_sgx; +/* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */ +static u64 sgx_pubkey_hash[4] __ro_after_init; + /* * ENCLS's memory operands use a fixed segment (DS) and a fixed * address size based on the mode. Related prefixes are ignored. @@ -323,3 +326,35 @@ int handle_encls(struct kvm_vcpu *vcpu) } return 1; } + +void setup_default_sgx_lepubkeyhash(void) +{ + /* + * Use Intel's default value for Skylake hardware if Launch Control is + * not supported, i.e. Intel's hash is hardcoded into silicon, or if + * Launch Control is supported and enabled, i.e. mimic the reset value + * and let the guest write the MSRs at will. If Launch Control is + * supported but disabled, then use the current MSR values as the hash + * MSRs exist but are read-only (locked and not writable). 
+ */ + if (!enable_sgx || boot_cpu_has(X86_FEATURE_SGX_LC) || + rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) { + sgx_pubkey_hash[0] = 0xa6053e051270b7acULL; + sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL; + sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL; + sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL; + } else { + /* MSR_IA32_SGXLEPUBKEYHASH0 is read above */ + rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]); + rdmsrl(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]); + rdmsrl(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]); + } +} + +void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash, + sizeof(sgx_pubkey_hash)); +} diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h index 6e17ecd4aca3..6502fa52c7e9 100644 --- a/arch/x86/kvm/vmx/sgx.h +++ b/arch/x86/kvm/vmx/sgx.h @@ -8,8 +8,14 @@ extern bool __read_mostly enable_sgx; int handle_encls(struct kvm_vcpu *vcpu); + +void setup_default_sgx_lepubkeyhash(void); +void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu); #else #define enable_sgx 0 + +static inline void setup_default_sgx_lepubkeyhash(void) { } +static inline void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { } #endif #endif /* __KVM_X86_SGX_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0d4a9d0dd997..02c1d44c2e22 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1904,6 +1904,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vcpu->arch.mcg_ext_ctl; break; + case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) + return 1; + msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash + [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; + break; case MSR_IA32_FEATURE_CONTROL: msr_info->data = vmx->msr_ia32_feature_control; break; @@ -2157,6 +2164,26 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); break; + case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: + /* + * On real hardware, the LE hash MSRs are writable before + * the firmware sets bit 0 in MSR 0x7a ("activating" SGX), + * at which point SGX related bits in IA32_FEATURE_CONTROL + * become writable. + * + * KVM does not emulate SGX activation for simplicity, so + * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL + * is unlocked. This is technically not architectural + * behavior, but it's close enough. + */ + if (!msr_info->host_initiated && + (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) || + ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && + !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) + return 1; + vmx->msr_ia32_sgxlepubkeyhash + [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; + break; case MSR_IA32_VMX_BASIC ... 
MSR_IA32_VMX_VMFUNC: if (!msr_info->host_initiated) return 1; /* they are read-only */ @@ -7049,6 +7076,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) else memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); + vcpu_setup_sgx_lepubkeyhash(&vmx->vcpu); + vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; @@ -8333,6 +8362,8 @@ static __init int hardware_setup(void) if (!enable_ept || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; + setup_default_sgx_lepubkeyhash(); + if (nested) { nested_vmx_setup_ctls_msrs(&vmcs_config.nested, vmx_capability.ept); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 965d4dc31674..4c39f9c1f870 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -302,6 +302,9 @@ struct vcpu_vmx { */ u64 msr_ia32_feature_control; u64 msr_ia32_feature_control_valid_bits; + /* SGX Launch Control public key hash */ + u64 msr_ia32_sgxlepubkeyhash[4]; + u64 ept_pointer; struct pt_desc pt_desc; -- Gitee From dc78d581ca0adc64d10ddf1d122c2d1122623ee6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:41 +1200 Subject: [PATCH 050/134] KVM: VMX: Add ENCLS[EINIT] handler to support SGX Launch Control (LC) commit b6f084ca553845135ccade79ce6548035e52884a upstream. Add a VM-Exit handler to trap-and-execute EINIT when SGX LC is enabled in the host. When SGX LC is enabled, the host kernel may rewrite the hardware values at will, e.g. to launch enclaves with different signers, thus KVM needs to intercept EINIT to ensure it is executed with the correct LE hash (even if the guest sees a hardwired hash). Switching the LE hash MSRs on VM-Enter/VM-Exit is not a viable option as writing the MSRs is prohibitively expensive, e.g. on SKL hardware each WRMSR is ~400 cycles. And because EINIT takes tens of thousands of cycles to execute, the ~1500 cycle overhead to trap-and-execute EINIT is unlikely to be noticed by the guest, let alone impact its overall SGX performance. Intel-SIG: commit b6f084ca5538 KVM: VMX: Add ENCLS[EINIT] handler to support SGX Launch Control (LC). Backport for SGX virtualization support. Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Message-Id: <57c92fa4d2083eb3be9e6355e3882fc90cffea87.1618196135.git.kai.huang@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Fan Du [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kvm/vmx/sgx.c | 64 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index d53e6b17b320..2eed5da91698 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -287,6 +287,68 @@ static int handle_encls_ecreate(struct kvm_vcpu *vcpu) return r; } +static int handle_encls_einit(struct kvm_vcpu *vcpu) +{ + unsigned long sig_hva, secs_hva, token_hva, rflags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + gva_t sig_gva, secs_gva, token_gva; + gpa_t sig_gpa, secs_gpa, token_gpa; + int ret, trapnr; + + if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 1808, 4096, &sig_gva) || + sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva) || + sgx_get_encls_gva(vcpu, kvm_rdx_read(vcpu), 304, 512, &token_gva)) + return 1; + + /* + * Translate the SIGSTRUCT, SECS and TOKEN pointers from GVA to GPA. + * Resume the guest on failure to inject a #PF. 
+ */ + if (sgx_gva_to_gpa(vcpu, sig_gva, false, &sig_gpa) || + sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa) || + sgx_gva_to_gpa(vcpu, token_gva, false, &token_gpa)) + return 1; + + /* + * ...and then to HVA. The order of accesses isn't architectural, i.e. + * KVM doesn't have to fully process one address at a time. Exit to + * userspace if a GPA is invalid. Note, all structures are aligned and + * cannot split pages. + */ + if (sgx_gpa_to_hva(vcpu, sig_gpa, &sig_hva) || + sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva) || + sgx_gpa_to_hva(vcpu, token_gpa, &token_hva)) + return 0; + + ret = sgx_virt_einit((void __user *)sig_hva, (void __user *)token_hva, + (void __user *)secs_hva, + vmx->msr_ia32_sgxlepubkeyhash, &trapnr); + + if (ret == -EFAULT) + return sgx_inject_fault(vcpu, secs_gva, trapnr); + + /* + * sgx_virt_einit() returns -EINVAL when access_ok() fails on @sig_hva, + * @token_hva or @secs_hva. This should never happen as KVM checks host + * addresses at memslot creation. sgx_virt_einit() has already warned + * in this case, so just return. + */ + if (ret < 0) + return ret; + + rflags = vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | + X86_EFLAGS_AF | X86_EFLAGS_SF | + X86_EFLAGS_OF); + if (ret) + rflags |= X86_EFLAGS_ZF; + else + rflags &= ~X86_EFLAGS_ZF; + vmx_set_rflags(vcpu, rflags); + + kvm_rax_write(vcpu, ret); + return kvm_skip_emulated_instruction(vcpu); +} + static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf) { if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX)) @@ -319,6 +381,8 @@ int handle_encls(struct kvm_vcpu *vcpu) } else { if (leaf == ECREATE) return handle_encls_ecreate(vcpu); + if (leaf == EINIT) + return handle_encls_einit(vcpu); WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf); vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS; -- Gitee From 8a44fb28420a38ce9ee6458b232c6313ba781811 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:42 +1200 Subject: [PATCH 051/134] KVM: VMX: Enable SGX virtualization for SGX1, SGX2 and LC commit 72add915fbd5bf5c57deee3da5b2605e966ac199 upstream. Enable SGX virtualization now that KVM has the VM-Exit handlers needed to trap-and-execute ENCLS to ensure correctness and/or enforce the CPU model exposed to the guest. Add a KVM module param, "sgx", to allow an admin to disable SGX virtualization independent of the kernel. When supported in hardware and the kernel, advertise SGX1, SGX2 and SGX LC to userspace via CPUID and wire up the ENCLS_EXITING bitmap based on the guest's SGX capabilities, i.e. to allow ENCLS to be executed in an SGX-enabled guest. With the exception of the provision key, all SGX attribute bits may be exposed to the guest. Guest access to the provision key, which is controlled via securityfs, will be added in a future patch. Note, KVM does not yet support exposing ENCLS_C leafs or ENCLV leafs. Intel-SIG: commit 72add915fbd5 KVM: VMX: Enable SGX virtualization for SGX1, SGX2 and LC. Backport for SGX virtualization support. Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Message-Id: Signed-off-by: Paolo Bonzini Signed-off-by: Fan Du [ Zhiquan Li: amend commit log and resolve the conflict. 
- When the changes were made at v5.13, we have call path as below: kvm_vcpu_ioctl_set_cpuid2() -> kvm_set_cpuid() -> kvm_update_cpuid_runtime() -> kvm_vcpu_after_set_cpuid() chunk 1) -> static_call(kvm_x86_vcpu_after_set_cpuid)() => vmx_vcpu_after_set_cpuid() chunk 2) Now we have call path as below at v5.4: kvm_vcpu_ioctl_set_cpuid2() -> kvm_x86_ops->cpuid_update() => vmx_cpuid_update() chunk 2) -> kvm_update_cpuid() chunk 1) 1. Move chunk 1) in function kvm_vcpu_after_set_cpuid() to kvm_update_cpuid(). 2. Move chunk 2) in function vmx_vcpu_after_set_cpuid() to vmx_cpuid_update(). - In function nested_vmx_exit_reflected(), invert the return value of nested_vmx_exit_handled_encls() as per the old logic. - The commit 5a085326d51d ("KVM: VMX: Rename ops.h to vmx_ops.h") has not been backported, so including old ops.h. ] Signed-off-by: Zhiquan Li --- arch/x86/kvm/cpuid.c | 57 +++++++++++++++++++++++++++- arch/x86/kvm/vmx/nested.c | 27 +++++++++++-- arch/x86/kvm/vmx/nested.h | 5 +++ arch/x86/kvm/vmx/sgx.c | 80 ++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/vmx/sgx.h | 13 +++++++ arch/x86/kvm/vmx/vmcs12.c | 1 + arch/x86/kvm/vmx/vmcs12.h | 4 +- arch/x86/kvm/vmx/vmx.c | 26 ++++++++++++- 8 files changed, 204 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c22b913ab541..bcc7cd7f5b63 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "cpuid.h" #include "lapic.h" #include "mmu.h" @@ -179,6 +180,21 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) MSR_IA32_MISC_ENABLE_MWAIT); } + /* + * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate + * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's + * requested XCR0 value. The enclave's XFRM must be a subset of XCRO + * at the time of EENTER, thus adjust the allowed XFRM by the guest's + * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to + * '1' even on CPUs that don't support XSAVE. + */ + best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1); + if (best) { + best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff; + best->edx &= vcpu->arch.guest_supported_xcr0 >> 32; + best->ecx |= XFEATURE_MASK_FPSSE; + } + /* Update physical-address width */ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); kvm_mmu_reset_context(vcpu); @@ -397,7 +413,7 @@ void kvm_set_cpu_caps(void) ); kvm_cpu_cap_mask(CPUID_7_0_EBX, - F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | + F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | @@ -408,7 +424,8 @@ void kvm_set_cpu_caps(void) F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ + F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ | + F(SGX_LC) ); /* Set LA57 based on hardware capability. 
*/ if (cpuid_ecx(7) & F(LA57)) @@ -429,6 +446,10 @@ void kvm_set_cpu_caps(void) F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd ); + kvm_cpu_cap_init_scattered(CPUID_12_EAX, + SF(SGX1) | SF(SGX2) + ); + kvm_cpu_cap_mask(CPUID_8000_0001_ECX, F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | @@ -743,6 +764,38 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } break; } + case 0x12: + /* Intel SGX */ + if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + + /* + * Index 0: Sub-features, MISCSELECT (a.k.a extended features) + * and max enclave sizes. The SGX sub-features and MISCSELECT + * are restricted by kernel and KVM capabilities (like most + * feature flags), while enclave size is unrestricted. + */ + cpuid_entry_override(entry, CPUID_12_EAX); + entry->ebx &= SGX_MISC_EXINFO; + + entry = do_host_cpuid(array, function, 1); + if (!entry) + goto out; + + /* + * Index 1: SECS.ATTRIBUTES. ATTRIBUTES are restricted a la + * feature flags. Advertise all supported flags, including + * privileged attributes that require explicit opt-in from + * userspace. ATTRIBUTES.XFRM is not adjusted as userspace is + * expected to derive it from supported XCR0. + */ + entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | + /* PROVISIONKEY | */ SGX_ATTR_EINITTOKENKEY | + SGX_ATTR_KSS; + entry->ebx &= 0; + break; /* Intel PT */ case 0x14: if (!f_intel_pt) { diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bffc951be851..64921ddb98c8 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -10,6 +10,7 @@ #include "hyperv.h" #include "mmu.h" #include "nested.h" +#include "sgx.h" #include "trace.h" #include "x86.h" @@ -2137,6 +2138,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write16(GUEST_INTR_STATUS, vmcs12->guest_intr_status); + if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) + vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); + secondary_exec_controls_set(vmx, exec_control); } @@ -5307,6 +5311,21 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, return false; } +static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + u32 encls_leaf; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || + !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) + return false; + + encls_leaf = kvm_rax_read(vcpu); + if (encls_leaf > 62) + encls_leaf = 63; + return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); +} + static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, gpa_t bitmap) { @@ -5506,13 +5525,12 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, case EXIT_REASON_VMFUNC: /* VM functions are emulated through L2->L0 vmexits. 
*/ return false; - case EXIT_REASON_ENCLS: - /* SGX is never exposed to L1 */ - return false; case EXIT_REASON_UMWAIT: case EXIT_REASON_TPAUSE: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); + case EXIT_REASON_ENCLS: + return !nested_vmx_exit_handled_encls(vcpu, vmcs12); default: return true; } @@ -5994,6 +6012,9 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) msrs->secondary_ctls_high |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + if (enable_sgx) + msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; + /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, msrs->misc_low, diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 7e960f084c52..b719f61aa4bc 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -249,6 +249,11 @@ static inline bool nested_exit_on_intr(struct kvm_vcpu *vcpu) PIN_BASED_EXT_INTR_MASK; } +static inline bool nested_cpu_has_encls_exit(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING); +} + /* * if fixed0[i] == 1: val[i] must be 1 * if fixed1[i] == 0: val[i] must be 0 diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 2eed5da91698..6693ebdc0770 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -5,11 +5,13 @@ #include "cpuid.h" #include "kvm_cache_regs.h" +#include "nested.h" #include "sgx.h" #include "vmx.h" #include "x86.h" -bool __read_mostly enable_sgx; +bool __read_mostly enable_sgx = 1; +module_param_named(sgx, enable_sgx, bool, 0444); /* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */ static u64 sgx_pubkey_hash[4] __ro_after_init; @@ -422,3 +424,79 @@ void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash, sizeof(sgx_pubkey_hash)); } + +/* + * ECREATE must be intercepted to enforce MISCSELECT, ATTRIBUTES and XFRM + * restrictions if the guest's allowed-1 settings diverge from hardware. + */ +static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *guest_cpuid; + u32 eax, ebx, ecx, edx; + + if (!vcpu->kvm->arch.sgx_provisioning_allowed) + return true; + + guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 0); + if (!guest_cpuid) + return true; + + cpuid_count(0x12, 0, &eax, &ebx, &ecx, &edx); + if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx) + return true; + + guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 1); + if (!guest_cpuid) + return true; + + cpuid_count(0x12, 1, &eax, &ebx, &ecx, &edx); + if (guest_cpuid->eax != eax || guest_cpuid->ebx != ebx || + guest_cpuid->ecx != ecx || guest_cpuid->edx != edx) + return true; + + return false; +} + +void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + /* + * There is no software enable bit for SGX that is virtualized by + * hardware, e.g. there's no CR4.SGXE, so when SGX is disabled in the + * guest (either by the host or by the guest's BIOS) but enabled in the + * host, trap all ENCLS leafs and inject #UD/#GP as needed to emulate + * the expected system behavior for ENCLS. 
+ */ + u64 bitmap = -1ull; + + /* Nothing to do if hardware doesn't support SGX */ + if (!cpu_has_vmx_encls_vmexit()) + return; + + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX) && + sgx_enabled_in_guest_bios(vcpu)) { + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) { + bitmap &= ~GENMASK_ULL(ETRACK, ECREATE); + if (sgx_intercept_encls_ecreate(vcpu)) + bitmap |= (1 << ECREATE); + } + + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) + bitmap &= ~GENMASK_ULL(EMODT, EAUG); + + /* + * Trap and execute EINIT if launch control is enabled in the + * host using the guest's values for launch control MSRs, even + * if the guest's values are fixed to hardware default values. + * The MSRs are not loaded/saved on VM-Enter/VM-Exit as writing + * the MSRs is extraordinarily expensive. + */ + if (boot_cpu_has(X86_FEATURE_SGX_LC)) + bitmap |= (1 << EINIT); + + if (!vmcs12 && is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + if (vmcs12 && nested_cpu_has_encls_exit(vmcs12)) + bitmap |= vmcs12->encls_exiting_bitmap; + } + vmcs_write64(ENCLS_EXITING_BITMAP, bitmap); +} diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h index 6502fa52c7e9..6c2e78141a6b 100644 --- a/arch/x86/kvm/vmx/sgx.h +++ b/arch/x86/kvm/vmx/sgx.h @@ -4,6 +4,9 @@ #include +#include "capabilities.h" +#include "ops.h" + #ifdef CONFIG_X86_SGX_KVM extern bool __read_mostly enable_sgx; @@ -11,11 +14,21 @@ int handle_encls(struct kvm_vcpu *vcpu); void setup_default_sgx_lepubkeyhash(void); void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu); + +void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); #else #define enable_sgx 0 static inline void setup_default_sgx_lepubkeyhash(void) { } static inline void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { } + +static inline void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + /* Nothing to do if hardware doesn't support SGX */ + if (cpu_has_vmx_encls_vmexit()) + vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); +} #endif #endif /* __KVM_X86_SGX_H */ diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index 53dfb401316d..355020944f6c 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -50,6 +50,7 @@ const unsigned short vmcs_field_to_offset_table[] = { FIELD64(VMREAD_BITMAP, vmread_bitmap), FIELD64(VMWRITE_BITMAP, vmwrite_bitmap), FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), + FIELD64(ENCLS_EXITING_BITMAP, encls_exiting_bitmap), FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index d0c6df373f67..3f323d619ca1 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -69,7 +69,8 @@ struct __packed vmcs12 { u64 vm_function_control; u64 eptp_list_address; u64 pml_address; - u64 padding64[3]; /* room for future expansion */ + u64 encls_exiting_bitmap; + u64 padding64[2]; /* room for future expansion */ /* * To allow migration of L1 (complete with its L2 guests) between * machines of different natural widths (32 or 64 bit), we cannot have @@ -259,6 +260,7 @@ static inline void vmx_check_vmcs12_offsets(void) CHECK_OFFSET(vm_function_control, 296); CHECK_OFFSET(eptp_list_address, 304); CHECK_OFFSET(pml_address, 312); + CHECK_OFFSET(encls_exiting_bitmap, 320); CHECK_OFFSET(cr0_guest_host_mask, 344); CHECK_OFFSET(cr4_guest_host_mask, 352); CHECK_OFFSET(cr0_read_shadow, 360); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 
02c1d44c2e22..b02af575a735 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2163,6 +2163,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx->msr_ia32_feature_control = data; if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); + + /* SGX may be enabled/disabled by guest's firmware */ + vmx_write_encls_bitmap(vcpu, NULL); break; case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: /* @@ -4443,8 +4446,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } - if (cpu_has_vmx_encls_vmexit()) - vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); + vmx_write_encls_bitmap(&vmx->vcpu, NULL); if (pt_mode == PT_MODE_HOST_GUEST) { memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); @@ -7658,6 +7660,19 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (cpu_has_vmx_pasid_trans() && guest_cpuid_has(vcpu, X86_FEATURE_ENQCMD)) vmx_vcpu_pasid_trans_init(vcpu); + + vmx_write_encls_bitmap(vcpu, NULL); + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX)) + vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED; + else + vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED; + + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) + vmx->msr_ia32_feature_control_valid_bits |= + FEAT_CTL_SGX_LC_ENABLED; + else + vmx->msr_ia32_feature_control_valid_bits &= + ~FEAT_CTL_SGX_LC_ENABLED; } /* @@ -7685,6 +7700,13 @@ static __init void vmx_set_cpu_caps(void) if (vmx_pt_supported()) kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); + if (!enable_sgx) { + kvm_cpu_cap_clear(X86_FEATURE_SGX); + kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); + kvm_cpu_cap_clear(X86_FEATURE_SGX1); + kvm_cpu_cap_clear(X86_FEATURE_SGX2); + } + /* PKU is not yet implemented for shadow paging. */ if (enable_ept && boot_cpu_has(X86_FEATURE_OSPKE)) kvm_cpu_cap_check_and_set(X86_FEATURE_PKU); -- Gitee From 872b3b700e6d9988a8f389c9fd04004d3a61455a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:43 +1200 Subject: [PATCH 052/134] KVM: x86: Add capability to grant VM access to privileged SGX attribute commit fe7e948837f312d87853b3fce743795d1ae3715a upstream. Add a capability, KVM_CAP_SGX_ATTRIBUTE, that can be used by userspace to grant a VM access to a priveleged attribute, with args[0] holding a file handle to a valid SGX attribute file. The SGX subsystem restricts access to a subset of enclave attributes to provide additional security for an uncompromised kernel, e.g. to prevent malware from using the PROVISIONKEY to ensure its nodes are running inside a geniune SGX enclave and/or to obtain a stable fingerprint. To prevent userspace from circumventing such restrictions by running an enclave in a VM, KVM restricts guest access to privileged attributes by default. Intel-SIG: commit fe7e948837f3 KVM: x86: Add capability to grant VM access to privileged SGX attribute. Backport for SGX virtualization support Cc: Andy Lutomirski Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Message-Id: <0b099d65e933e068e3ea934b0523bab070cb8cea.1618196135.git.kai.huang@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Fan Du [ Zhiquan Li: amend commit log and resolve the conflict. 
] Signed-off-by: Zhiquan Li --- Documentation/virt/kvm/api.txt | 23 +++++++++++++++++++++++ arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/x86.c | 21 +++++++++++++++++++++ include/uapi/linux/kvm.h | 1 + 4 files changed, 46 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt index 3a2cee9af7f0..247df75b5552 100644 --- a/Documentation/virt/kvm/api.txt +++ b/Documentation/virt/kvm/api.txt @@ -5112,6 +5112,29 @@ it hard or impossible to use it correctly. The availability of KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 signals that those bugs are fixed. Userspace should not try to use KVM_CAP_MANUAL_DIRTY_LOG_PROTECT. +7.25 KVM_CAP_SGX_ATTRIBUTE +---------------------- + +:Architectures: x86 +:Target: VM +:Parameters: args[0] is a file handle of a SGX attribute file in securityfs +:Returns: 0 on success, -EINVAL if the file handle is invalid or if a requested + attribute is not supported by KVM. + +KVM_CAP_SGX_ATTRIBUTE enables a userspace VMM to grant a VM access to one or +more priveleged enclave attributes. args[0] must hold a file handle to a valid +SGX attribute file corresponding to an attribute that is supported/restricted +by KVM (currently only PROVISIONKEY). + +The SGX subsystem restricts access to a subset of enclave attributes to provide +additional security for an uncompromised kernel, e.g. use of the PROVISIONKEY +is restricted to deter malware from using the PROVISIONKEY to obtain a stable +system fingerprint. To prevent userspace from circumventing such restrictions +by running an enclave in a VM, KVM prevents access to privileged attributes by +default. + +See Documentation/x86/sgx/2.Kernel-internals.rst for more details. + 7.32 KVM_CAP_MAX_VCPU_ID ------------------------ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index bcc7cd7f5b63..01de416474c7 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -792,7 +792,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) * expected to derive it from supported XCR0. */ entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | - /* PROVISIONKEY | */ SGX_ATTR_EINITTOKENKEY | + SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY | SGX_ATTR_KSS; entry->ebx &= 0; break; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a7a362755c29..bcdfb48ccd25 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -3506,6 +3507,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: +#ifdef CONFIG_X86_SGX_KVM + case KVM_CAP_SGX_ATTRIBUTE: +#endif case KVM_CAP_SYS_ATTRIBUTES: r = 1; break; @@ -5070,6 +5074,23 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, kvm->arch.exception_payload_enabled = cap->args[0]; r = 0; break; +#ifdef CONFIG_X86_SGX_KVM + case KVM_CAP_SGX_ATTRIBUTE: { + unsigned long allowed_attributes = 0; + + r = sgx_set_attribute(&allowed_attributes, cap->args[0]); + if (r) + break; + + /* KVM only supports the PROVISIONKEY privileged attribute. 
*/ + if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) && + !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY)) + kvm->arch.sgx_provisioning_allowed = true; + else + r = -EINVAL; + break; + } +#endif case KVM_CAP_MAX_VCPU_ID: r = -EINVAL; if (cap->args[0] > KVM_MAX_VCPU_IDS) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index c49bf3a938f0..528899d7d574 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1004,6 +1004,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PMU_EVENT_FILTER 173 #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 #define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 175 +#define KVM_CAP_SGX_ATTRIBUTE 196 #define KVM_CAP_XSAVE2 208 #define KVM_CAP_SYS_ATTRIBUTES 209 -- Gitee From 144d0544d95cbf6dfb63b42b4cfebae45cdc65e6 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Thu, 8 Apr 2021 12:29:24 +0300 Subject: [PATCH 053/134] x86/sgx: Do not update sgx_nr_free_pages in sgx_setup_epc_section() commit ae40aaf6bdbf0354a75b8284a0de453fcf5f4d32 upstream. The commit in Fixes: changed the SGX EPC page sanitization to end up in sgx_free_epc_page() which puts clean and sanitized pages on the free list. This was done for the reason that it is best to keep the logic to assign available-for-use EPC pages to the correct NUMA lists in a single location. sgx_nr_free_pages is also incremented by sgx_free_epc_pages() but those pages which are being added there per EPC section do not belong to the free list yet because they haven't been sanitized yet - they land on the dirty list first and the sanitization happens later when ksgxd starts massaging them. So remove that addition there and have sgx_free_epc_page() do that solely. [ bp: Sanitize commit message too. ] Intel-SIG: commit ae40aaf6bdbf x86/sgx: Do not update sgx_nr_free_pages in sgx_setup_epc_section(). Backport for SGX virtualization support. Fixes: 51ab30eb2ad4 ("x86/sgx: Replace section->init_laundry_list with sgx_dirty_page_list") Signed-off-by: Jarkko Sakkinen Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210408092924.7032-1-jarkko@kernel.org Signed-off-by: Fan Du [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 7d0aac46e25e..5527820fc8ec 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -656,7 +656,6 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, list_add_tail(§ion->pages[i].list, &sgx_dirty_page_list); } - sgx_nr_free_pages += nr_pages; return true; } -- Gitee From 9bdf54ae0e573df9fd443ea038d3bac4878be35c Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 15 Jun 2021 22:16:39 +1200 Subject: [PATCH 054/134] x86/sgx: Add missing xa_destroy() when virtual EPC is destroyed commit 4692bc775d2180a937335ccba0edce557103d44a upstream. xa_destroy() needs to be called to destroy a virtual EPC's page array before calling kfree() to free the virtual EPC. Currently it is not called so add the missing xa_destroy(). Intel-SIG: commit 4692bc775d21 x86/sgx: Add missing xa_destroy() when virtual EPC is destroyed. Backport for SGX virtualization support. 
Fixes: 540745ddbc70 ("x86/sgx: Introduce virtual EPC for use by KVM guests") Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Tested-by: Yang Zhong Link: https://lkml.kernel.org/r/20210615101639.291929-1-kai.huang@intel.com Signed-off-by: Fan Du [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/virt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index 7d221eac716a..8f7dd1dc4189 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -212,6 +212,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file) list_splice_tail(&secs_pages, &zombie_secs_pages); mutex_unlock(&zombie_secs_pages_lock); + xa_destroy(&vepc->page_array); kfree(vepc); return 0; -- Gitee From caae1a7132061b80d87773788ac820edd09155c7 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:14 -0700 Subject: [PATCH 055/134] x86/sgx: use vma_lookup() in sgx_encl_find() commit 9ce2c3fc0be6e7d0bb2236a33bbb7a0f1943bd81 upstream. Use vma_lookup() to find the VMA at a specific address. As vma_lookup() will return NULL if the address is not within any VMA, the start address no longer needs to be validated. Intel-SIG: commit 9ce2c3fc0be6 x86/sgx: use vma_lookup() in sgx_encl_find(). Backport for SGX virtualization support. Link: https://lkml.kernel.org/r/20210521174745.2219620-10-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Fan Du [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index 6e74f85b6264..fec43ca65065 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -91,8 +91,8 @@ static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr, { struct vm_area_struct *result; - result = find_vma(mm, addr); - if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start) + result = vma_lookup(mm, addr); + if (!result || result->vm_ops != &sgx_vm_ops) return -EINVAL; *vma = result; -- Gitee From 30f39c701d6cb05da94f904ed22bd64eb0bb13c0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 21 Oct 2021 16:11:54 -0400 Subject: [PATCH 056/134] x86/sgx/virt: extract sgx_vepc_remove_page commit fd5128e622d7834bb3f7ee23c2bbea8db63cebaf upstream. For bare-metal SGX on real hardware, the hardware provides guarantees SGX state at reboot. For instance, all pages start out uninitialized. The vepc driver provides a similar guarantee today for freshly-opened vepc instances, but guests such as Windows expect all pages to be in uninitialized state on startup, including after every guest reboot. One way to do this is to simply close and reopen the /dev/sgx_vepc file descriptor and re-mmap the virtual EPC. However, this is problematic because it prevents sandboxing the userspace (for example forbidding open() after the guest starts; this is doable with heavy use of SCM_RIGHTS file descriptor passing). In order to implement this, we will need a ioctl that performs EREMOVE on all pages mapped by a /dev/sgx_vepc file descriptor: other possibilities, such as closing and reopening the device, are racy. 
Start the implementation by creating a separate function with just the __eremove wrapper. Intel-SIG: commit fd5128e622d7 x86/sgx/virt: extract sgx_vepc_remove_page. Backport for SGX virtualization support. Reviewed-by: Jarkko Sakkinen Reviewed-by: Dave Hansen Signed-off-by: Paolo Bonzini Signed-off-by: Dave Hansen Link: https://lkml.kernel.org/r/20211021201155.1523989-2-pbonzini@redhat.com Signed-off-by: Fan Du [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/virt.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index 8f7dd1dc4189..09674c824ae9 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -111,10 +111,8 @@ static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) +static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page) { - int ret; - /* * Take a previously guest-owned EPC page and return it to the * general EPC page pool. @@ -124,7 +122,12 @@ static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) * case that a guest properly EREMOVE'd this page, a superfluous * EREMOVE is harmless. */ - ret = __eremove(sgx_get_epc_virt_addr(epc_page)); + return __eremove(sgx_get_epc_virt_addr(epc_page)); +} + +static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) +{ + int ret = sgx_vepc_remove_page(epc_page); if (ret) { /* * Only SGX_CHILD_PRESENT is expected, which is because of @@ -144,7 +147,6 @@ static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) } sgx_free_epc_page(epc_page); - return 0; } -- Gitee From caf79be089cac26dd6055cc9614857ff4b7fad38 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 21 Oct 2021 16:11:55 -0400 Subject: [PATCH 057/134] x86/sgx/virt: implement SGX_IOC_VEPC_REMOVE ioctl commit ae095b16fc652f459e6c16a256834985c85ecc4d upstream. For bare-metal SGX on real hardware, the hardware provides guarantees SGX state at reboot. For instance, all pages start out uninitialized. The vepc driver provides a similar guarantee today for freshly-opened vepc instances, but guests such as Windows expect all pages to be in uninitialized state on startup, including after every guest reboot. Some userspace implementations of virtual SGX would rather avoid having to close and reopen the /dev/sgx_vepc file descriptor and re-mmap the virtual EPC. For example, they could sandbox themselves after the guest starts and forbid further calls to open(), in order to mitigate exploits from untrusted guests. Therefore, add a ioctl that does this with EREMOVE. Userspace can invoke the ioctl to bring its vEPC pages back to uninitialized state. There is a possibility that some pages fail to be removed if they are SECS pages, and the child and SECS pages could be in separate vEPC regions. Therefore, the ioctl returns the number of EREMOVE failures, telling userspace to try the ioctl again after it's done with all vEPC regions. A more verbose description of the correct usage and the possible error conditions is documented in sgx.rst. Intel-SIG: commit ae095b16fc65 x86/sgx/virt: implement SGX_IOC_VEPC_REMOVE ioctl. Backport for SGX virtualization support. 
Reviewed-by: Jarkko Sakkinen Reviewed-by: Dave Hansen Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Dave Hansen Link: https://lkml.kernel.org/r/20211021201155.1523989-3-pbonzini@redhat.com Signed-off-by: Fan Du [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- Documentation/x86/sgx.rst | 35 ++++++++++++++++++++++ arch/x86/include/uapi/asm/sgx.h | 2 ++ arch/x86/kernel/cpu/sgx/virt.c | 53 +++++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+) diff --git a/Documentation/x86/sgx.rst b/Documentation/x86/sgx.rst index dd0ac96ff9ef..a608f667fb95 100644 --- a/Documentation/x86/sgx.rst +++ b/Documentation/x86/sgx.rst @@ -250,3 +250,38 @@ user wants to deploy SGX applications both on the host and in guests on the same machine, the user should reserve enough EPC (by taking out total virtual EPC size of all SGX VMs from the physical EPC size) for host SGX applications so they can run with acceptable performance. + +Architectural behavior is to restore all EPC pages to an uninitialized +state also after a guest reboot. Because this state can be reached only +through the privileged ``ENCLS[EREMOVE]`` instruction, ``/dev/sgx_vepc`` +provides the ``SGX_IOC_VEPC_REMOVE_ALL`` ioctl to execute the instruction +on all pages in the virtual EPC. + +``EREMOVE`` can fail for three reasons. Userspace must pay attention +to expected failures and handle them as follows: + +1. Page removal will always fail when any thread is running in the + enclave to which the page belongs. In this case the ioctl will + return ``EBUSY`` independent of whether it has successfully removed + some pages; userspace can avoid these failures by preventing execution + of any vcpu which maps the virtual EPC. + +2. Page removal will cause a general protection fault if two calls to + ``EREMOVE`` happen concurrently for pages that refer to the same + "SECS" metadata pages. This can happen if there are concurrent + invocations to ``SGX_IOC_VEPC_REMOVE_ALL``, or if a ``/dev/sgx_vepc`` + file descriptor in the guest is closed at the same time as + ``SGX_IOC_VEPC_REMOVE_ALL``; it will also be reported as ``EBUSY``. + This can be avoided in userspace by serializing calls to the ioctl() + and to close(), but in general it should not be a problem. + +3. Finally, page removal will fail for SECS metadata pages which still + have child pages. Child pages can be removed by executing + ``SGX_IOC_VEPC_REMOVE_ALL`` on all ``/dev/sgx_vepc`` file descriptors + mapped into the guest. This means that the ioctl() must be called + twice: an initial set of calls to remove child pages and a subsequent + set of calls to remove SECS pages. The second set of calls is only + required for those mappings that returned a nonzero value from the + first call. It indicates a bug in the kernel or the userspace client + if any of the second round of ``SGX_IOC_VEPC_REMOVE_ALL`` calls has + a return code other than 0. 
diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h index 9034f3007c4e..95f2b40a55cd 100644 --- a/arch/x86/include/uapi/asm/sgx.h +++ b/arch/x86/include/uapi/asm/sgx.h @@ -27,6 +27,8 @@ enum sgx_page_flags { _IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init) #define SGX_IOC_ENCLAVE_PROVISION \ _IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision) +#define SGX_IOC_VEPC_REMOVE_ALL \ + _IO(SGX_MAGIC, 0x04) /** * struct sgx_enclave_create - parameter structure for the diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index 09674c824ae9..b4f9a50de776 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -150,6 +150,41 @@ static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) return 0; } +static long sgx_vepc_remove_all(struct sgx_vepc *vepc) +{ + struct sgx_epc_page *entry; + unsigned long index; + long failures = 0; + + xa_for_each(&vepc->page_array, index, entry) { + int ret = sgx_vepc_remove_page(entry); + if (ret) { + if (ret == SGX_CHILD_PRESENT) { + /* The page is a SECS, userspace will retry. */ + failures++; + } else { + /* + * Report errors due to #GP or SGX_ENCLAVE_ACT; do not + * WARN, as userspace can induce said failures by + * calling the ioctl concurrently on multiple vEPCs or + * while one or more CPUs is running the enclave. Only + * a #PF on EREMOVE indicates a kernel/hardware issue. + */ + WARN_ON_ONCE(encls_faulted(ret) && + ENCLS_TRAPNR(ret) != X86_TRAP_GP); + return -EBUSY; + } + } + cond_resched(); + } + + /* + * Return the number of SECS pages that failed to be removed, so + * userspace knows that it has to retry. + */ + return failures; +} + static int sgx_vepc_release(struct inode *inode, struct file *file) { struct sgx_vepc *vepc = file->private_data; @@ -235,9 +270,27 @@ static int sgx_vepc_open(struct inode *inode, struct file *file) return 0; } +static long sgx_vepc_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct sgx_vepc *vepc = file->private_data; + + switch (cmd) { + case SGX_IOC_VEPC_REMOVE_ALL: + if (arg) + return -EINVAL; + return sgx_vepc_remove_all(vepc); + + default: + return -ENOTTY; + } +} + static const struct file_operations sgx_vepc_fops = { .owner = THIS_MODULE, .open = sgx_vepc_open, + .unlocked_ioctl = sgx_vepc_ioctl, + .compat_ioctl = sgx_vepc_ioctl, .release = sgx_vepc_release, .mmap = sgx_vepc_mmap, }; -- Gitee From 3b95ecef73a380bbb45dffcdf421c8674cf2bfba Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Mon, 15 Nov 2021 11:29:04 -0800 Subject: [PATCH 058/134] x86/sgx: Fix free page accounting commit ac5d272a0ad0419f52e08c91953356e32b075af7 upstream. The SGX driver maintains a single global free page counter, sgx_nr_free_pages, that reflects the number of free pages available across all NUMA nodes. Correspondingly, a list of free pages is associated with each NUMA node and sgx_nr_free_pages is updated every time a page is added or removed from any of the free page lists. The main usage of sgx_nr_free_pages is by the reclaimer that runs when it (sgx_nr_free_pages) goes below a watermark to ensure that there are always some free pages available to, for example, support efficient page faults. With sgx_nr_free_pages accessed and modified from a few places it is essential to ensure that these accesses are done safely but this is not the case. sgx_nr_free_pages is read without any protection and updated with inconsistent protection by any one of the spin locks associated with the individual NUMA nodes. 
For example: CPU_A CPU_B ----- ----- spin_lock(&nodeA->lock); spin_lock(&nodeB->lock); ... ... sgx_nr_free_pages--; /* NOT SAFE */ sgx_nr_free_pages--; spin_unlock(&nodeA->lock); spin_unlock(&nodeB->lock); Since sgx_nr_free_pages may be protected by different spin locks while being modified from different CPUs, the following scenario is possible: CPU_A CPU_B ----- ----- {sgx_nr_free_pages = 100} spin_lock(&nodeA->lock); spin_lock(&nodeB->lock); sgx_nr_free_pages--; sgx_nr_free_pages--; /* LOAD sgx_nr_free_pages = 100 */ /* LOAD sgx_nr_free_pages = 100 */ /* sgx_nr_free_pages-- */ /* sgx_nr_free_pages-- */ /* STORE sgx_nr_free_pages = 99 */ /* STORE sgx_nr_free_pages = 99 */ spin_unlock(&nodeA->lock); spin_unlock(&nodeB->lock); In the above scenario, sgx_nr_free_pages is decremented from two CPUs but instead of sgx_nr_free_pages ending with a value that is two less than it started with, it was only decremented by one while the number of free pages were actually reduced by two. The consequence of sgx_nr_free_pages not being protected is that its value may not accurately reflect the actual number of free pages on the system, impacting the availability of free pages in support of many flows. The problematic scenario is when the reclaimer does not run because it believes there to be sufficient free pages while any attempt to allocate a page fails because there are no free pages available. In the SGX driver the reclaimer's watermark is only 32 pages so after encountering the above example scenario 32 times a user space hang is possible when there are no more free pages because of repeated page faults caused by no free pages made available. The following flow was encountered: asm_exc_page_fault ... sgx_vma_fault() sgx_encl_load_page() sgx_encl_eldu() // Encrypted page needs to be loaded from backing // storage into newly allocated SGX memory page sgx_alloc_epc_page() // Allocate a page of SGX memory __sgx_alloc_epc_page() // Fails, no free SGX memory ... if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) // Wake reclaimer wake_up(&ksgxd_waitq); return -EBUSY; // Return -EBUSY giving reclaimer time to run return -EBUSY; return -EBUSY; return VM_FAULT_NOPAGE; The reclaimer is triggered in above flow with the following code: static bool sgx_should_reclaim(unsigned long watermark) { return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list); } In the problematic scenario there were no free pages available yet the value of sgx_nr_free_pages was above the watermark. The allocation of SGX memory thus always failed because of a lack of free pages while no free pages were made available because the reclaimer is never started because of sgx_nr_free_pages' incorrect value. The consequence was that user space kept encountering VM_FAULT_NOPAGE that caused the same address to be accessed repeatedly with the same result. Change the global free page counter to an atomic type that ensures simultaneous updates are done safely. While doing so, move the updating of the variable outside of the spin lock critical section to which it does not belong. Intel-SIG: commit ac5d272a0ad0 x86/sgx: Fix free page accounting Backport for SGX virtualization support. 
Cc: stable@vger.kernel.org Fixes: 901ddbb9ecf5 ("x86/sgx: Add a basic NUMA allocation scheme to sgx_alloc_epc_page()") Suggested-by: Dave Hansen Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Tony Luck Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/a95a40743bbd3f795b465f30922dde7f1ea9e0eb.1637004094.git.reinette.chatre@intel.com Signed-off-by: Fan Du [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/main.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 5527820fc8ec..f41573ac8b47 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -28,8 +28,7 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); static LIST_HEAD(sgx_active_page_list); static DEFINE_SPINLOCK(sgx_reclaimer_lock); -/* The free page list lock protected variables prepend the lock. */ -static unsigned long sgx_nr_free_pages; +static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0); /* Nodes with one or more EPC sections. */ static nodemask_t sgx_numa_mask; @@ -403,14 +402,15 @@ static void sgx_reclaim_pages(void) spin_lock(&node->lock); list_add_tail(&epc_page->list, &node->free_page_list); - sgx_nr_free_pages++; spin_unlock(&node->lock); + atomic_long_inc(&sgx_nr_free_pages); } } static bool sgx_should_reclaim(unsigned long watermark) { - return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list); + return atomic_long_read(&sgx_nr_free_pages) < watermark && + !list_empty(&sgx_active_page_list); } static int ksgxd(void *p) @@ -471,9 +471,9 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list); list_del_init(&page->list); - sgx_nr_free_pages--; spin_unlock(&node->lock); + atomic_long_dec(&sgx_nr_free_pages); return page; } @@ -625,9 +625,9 @@ void sgx_free_epc_page(struct sgx_epc_page *page) spin_lock(&node->lock); list_add_tail(&page->list, &node->free_page_list); - sgx_nr_free_pages++; spin_unlock(&node->lock); + atomic_long_inc(&sgx_nr_free_pages); } static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, -- Gitee From 83696ea8ccc329e95a6d4e923cecfb8cb4bec33c Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 8 Feb 2022 10:48:07 -0800 Subject: [PATCH 059/134] x86/sgx: Silence softlockup detection when releasing large enclaves commit 8795359e35bc33bf86b6d0765aa7f37431db3b9c upstream. Vijay reported that the "unclobbered_vdso_oversubscribed" selftest triggers the softlockup detector. Actual SGX systems have 128GB of enclave memory or more. The "unclobbered_vdso_oversubscribed" selftest creates one enclave which consumes all of the enclave memory on the system. Tearing down such a large enclave takes around a minute, most of it in the loop where the EREMOVE instruction is applied to each individual 4k enclave page. Spending one minute in a loop triggers the softlockup detector. Add a cond_resched() to give other tasks a chance to run and placate the softlockup detector. Intel-SIG: commit 8795359e35bc x86/sgx: Silence softlockup detection when releasing large enclaves. Backport for SGX virtualization support. 
Cc: stable@vger.kernel.org Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer") Reported-by: Vijay Dhanraj Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Acked-by: Dave Hansen Tested-by: Jarkko Sakkinen (kselftest as sanity check) Link: https://lkml.kernel.org/r/ced01cac1e75f900251b0a4ae1150aa8ebd295ec.1644345232.git.reinette.chatre@intel.com Signed-off-by: Fan Du [ Zhiquan Li: amend commit log and resolve the conflict. The issue has been fixed by commit d8c633308528 ("sgx: fix softlockup when sgx_encl_release"), just align the code to upstream here. ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index f61f946755a8..9f978d0cb506 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -409,8 +409,9 @@ void sgx_encl_release(struct kref *ref) entry->epc_page = NULL; } - cond_resched(); kfree(entry); + /* Invoke scheduler to prevent soft lockups. */ + cond_resched(); } xa_destroy(&encl->page_array); -- Gitee From bc4638942446399c3bac4a5283d642d6ca563d4c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 19 May 2021 10:51:43 +0200 Subject: [PATCH 060/134] docs: virt: api.rst: fix a pointer to SGX documentation commit 0a5fab9f085880dbd7f9b0055c74936ca8b64fc1 upstream. The document which describes the SGX kernel architecture was added at commit 3fa97bf00126 ("Documentation/x86: Document SGX kernel architecture") but the reference at virt/kvm/api.rst is pointing to some non-existing document. Intel-SIG: commit 0a5fab9f0858 docs: virt: api.rst: fix a pointer to SGX documentation Backport for SGX virtualization support on Intel Xeon platform. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/138c24633c6e4edf862a2b4d77033c603fc10406.1621413933.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- Documentation/virt/kvm/api.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt index 247df75b5552..f7d96f64d84f 100644 --- a/Documentation/virt/kvm/api.txt +++ b/Documentation/virt/kvm/api.txt @@ -5133,7 +5133,7 @@ system fingerprint. To prevent userspace from circumventing such restrictions by running an enclave in a VM, KVM prevents access to privileged attributes by default. -See Documentation/x86/sgx/2.Kernel-internals.rst for more details. +See Documentation/x86/sgx.rst for more details. 7.32 KVM_CAP_MAX_VCPU_ID ------------------------ -- Gitee From 3f73d9fa6743b6f92d25f178d3478972430d5873 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 20 Apr 2021 18:08:50 -0700 Subject: [PATCH 061/134] KVM: x86: Fix implicit enum conversion goof in scattered reverse CPUID code commit 462f8ddebccbb8a364b154008212052d515ac6b1 upstream. Take "enum kvm_only_cpuid_leafs" in scattered specific CPUID helpers (which is obvious in hindsight), and use "unsigned int" for leafs that can be the kernel's standard "enum cpuid_leaf" or the aforementioned KVM-only variant. Loss of the enum params is a bit disapponting, but gcc obviously isn't providing any extra sanity checks, and the various BUILD_BUG_ON() assertions ensure the input is in range. 
This fixes implicit enum conversions that are detected by clang-11: arch/x86/kvm/cpuid.c:499:29: warning: implicit conversion from enumeration type 'enum kvm_only_cpuid_leafs' to different enumeration type 'enum cpuid_leafs' [-Wenum-conversion] kvm_cpu_cap_init_scattered(CPUID_12_EAX, ~~~~~~~~~~~~~~~~~~~~~~~~~~ ^~~~~~~~~~~~ arch/x86/kvm/cpuid.c:837:31: warning: implicit conversion from enumeration type 'enum kvm_only_cpuid_leafs' to different enumeration type 'enum cpuid_leafs' [-Wenum-conversion] cpuid_entry_override(entry, CPUID_12_EAX); ~~~~~~~~~~~~~~~~~~~~ ^~~~~~~~~~~~ 2 warnings generated. Intel-SIG: commit 462f8ddebccb KVM: x86: Fix implicit enum conversion goof in scattered reverse CPUID code. Backport for SGX virtualization support. Fixes: 4e66c0cb79b7 ("KVM: x86: Add support for reverse CPUID lookup of scattered features") Cc: Kai Huang Signed-off-by: Sean Christopherson Message-Id: <20210421010850.3009718-1-seanjc@google.com> Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kvm/cpuid.c | 5 +++-- arch/x86/kvm/cpuid.h | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 01de416474c7..97673552ee4b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -333,7 +333,7 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, } /* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */ -static __always_inline void __kvm_cpu_cap_mask(enum cpuid_leafs leaf) +static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf) { const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32); struct kvm_cpuid_entry2 entry; @@ -346,7 +346,8 @@ static __always_inline void __kvm_cpu_cap_mask(enum cpuid_leafs leaf) kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg); } -static __always_inline void kvm_cpu_cap_init_scattered(enum cpuid_leafs leaf, u32 mask) +static __always_inline +void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask) { /* Use kvm_cpu_cap_mask for non-scattered leafs. */ BUILD_BUG_ON(leaf < NCAPINTS); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 2281280a4259..093f913d55a1 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -214,7 +214,7 @@ static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry, } static __always_inline void cpuid_entry_override(struct kvm_cpuid_entry2 *entry, - enum cpuid_leafs leaf) + unsigned int leaf) { u32 *reg = cpuid_entry_get_reg(entry, leaf * 32); -- Gitee From 131746456bd244a74f9c00b6a2929fe796f44032 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 26 Oct 2021 15:00:44 -0700 Subject: [PATCH 062/134] x86/sgx: Add new sgx_epc_page flag bit to mark free pages commit d6d261bded8a57aed4faa12d08a5b193418d3aa4 upstream. SGX EPC pages go through the following life cycle: DIRTY ---> FREE ---> IN-USE --\ ^ | \-----------------/ Recovery action for poison for a DIRTY or FREE page is simple. Just make sure never to allocate the page. IN-USE pages need some extra handling. Add a new flag bit SGX_EPC_PAGE_IS_FREE that is set when a page is added to a free list and cleared when the page is allocated. Notes: 1) These transitions are made while holding the node->lock so that future code that checks the flags while holding the node->lock can be sure that if the SGX_EPC_PAGE_IS_FREE bit is set, then the page is on the free list. 2) Initially while the pages are on the dirty list the SGX_EPC_PAGE_IS_FREE bit is cleared. 
Intel-SIG: commit d6d261bded8a x86/sgx: Add new sgx_epc_page flag bit to mark free pages. Backport for SGX MCA recovery co-existence support. Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Reinette Chatre Link: https://lkml.kernel.org/r/20211026220050.697075-2-tony.luck@intel.com [ Zhiquan: amend commit log and adapt the code ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/main.c | 2 ++ arch/x86/kernel/cpu/sgx/sgx.h | 3 +++ 2 files changed, 5 insertions(+) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index f41573ac8b47..7747cef7e9dd 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -471,6 +471,7 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list); list_del_init(&page->list); + page->flags = 0; spin_unlock(&node->lock); atomic_long_dec(&sgx_nr_free_pages); @@ -625,6 +626,7 @@ void sgx_free_epc_page(struct sgx_epc_page *page) spin_lock(&node->lock); list_add_tail(&page->list, &node->free_page_list); + page->flags = SGX_EPC_PAGE_IS_FREE; spin_unlock(&node->lock); atomic_long_inc(&sgx_nr_free_pages); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 4628acec0009..5906471156c5 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -26,6 +26,9 @@ /* Pages, which are being tracked by the page reclaimer. */ #define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0) +/* Pages on free list */ +#define SGX_EPC_PAGE_IS_FREE BIT(1) + struct sgx_epc_page { unsigned int section; unsigned int flags; -- Gitee From 4756f9c4e3deb7262cdda981e768bde867b6f2b5 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 26 Oct 2021 15:00:45 -0700 Subject: [PATCH 063/134] x86/sgx: Add infrastructure to identify SGX EPC pages commit 40e0e7843e23d164625b9031514f5672f8758bf4 upstream. X86 machine check architecture reports a physical address when there is a memory error. Handling that error requires a method to determine whether the physical address reported is in any of the areas reserved for EPC pages by BIOS. SGX EPC pages do not have Linux "struct page" associated with them. Keep track of the mapping from ranges of EPC pages to the sections that contain them using an xarray. N.B. adds CONFIG_XARRAY_MULTI to the SGX dependecies. So "select" that in arch/x86/Kconfig for X86/SGX. Create a function arch_is_platform_page() that simply reports whether an address is an EPC page for use elsewhere in the kernel. The ACPI error injection code needs this function and is typically built as a module, so export it. Note that arch_is_platform_page() will be slower than other similar "what type is this page" functions that can simply check bits in the "struct page". If there is some future performance critical user of this function it may need to be implemented in a more efficient way. Note also that the current implementation of xarray allocates a few hundred kilobytes for this usage on a system with 4GB of SGX EPC memory configured. This isn't ideal, but worth it for the code simplicity. Intel-SIG: commit 40e0e7843e23 x86/sgx: Add infrastructure to identify SGX EPC pages. Backport for SGX MCA recovery co-existence support. 
Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Reinette Chatre Link: https://lkml.kernel.org/r/20211026220050.697075-3-tony.luck@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/Kconfig | 1 + arch/x86/kernel/cpu/sgx/main.c | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9c069f5da31c..e9c1b1428ce0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2017,6 +2017,7 @@ config X86_SGX select SRCU select MMU_NOTIFIER select NUMA_KEEP_MEMINFO if NUMA + select XARRAY_MULTI help Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions that can be used by applications to set aside private regions of code diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 7747cef7e9dd..71930c0a5f88 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -20,6 +20,7 @@ struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; static int sgx_nr_epc_sections; static struct task_struct *ksgxd_tsk; static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); +static DEFINE_XARRAY(sgx_epc_address_space); /* * These variables are part of the state of the reclaimer, and must be accessed @@ -650,6 +651,8 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, } section->phys_addr = phys_addr; + xa_store_range(&sgx_epc_address_space, section->phys_addr, + phys_addr + size - 1, section, GFP_KERNEL); for (i = 0; i < nr_pages; i++) { section->pages[i].section = index; @@ -661,6 +664,12 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, return true; } +bool arch_is_platform_page(u64 paddr) +{ + return !!xa_load(&sgx_epc_address_space, paddr); +} +EXPORT_SYMBOL_GPL(arch_is_platform_page); + /** * A section metric is concatenated in a way that @low bits 12-31 define the * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the -- Gitee From fabe4ca71947a479123b27837d489e0d3f227cf7 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 26 Oct 2021 15:00:46 -0700 Subject: [PATCH 064/134] x86/sgx: Initial poison handling for dirty and free pages commit 992801ae92431761b3d8ec88abd5793d154d34ac upstream. A memory controller patrol scrubber can report poison in a page that isn't currently being used. Add "poison" field in the sgx_epc_page that can be set for an sgx_epc_page. Check for it: 1) When sanitizing dirty pages 2) When freeing epc pages Poison is a new field separated from flags to avoid having to make all updates to flags atomic, or integrate poison state changes into some other locking scheme to protect flags (Currently just sgx_reclaimer_lock which protects the SGX_EPC_PAGE_RECLAIMER_TRACKED bit in page->flags). In both cases place the poisoned page on a per-node list of poisoned epc pages to make sure it will not be reallocated. Intel-SIG: commit 992801ae9243 x86/sgx: Initial poison handling for dirty and free pages. Backport for SGX MCA recovery co-existence support. 
Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Reinette Chatre Link: https://lkml.kernel.org/r/20211026220050.697075-4-tony.luck@intel.com [ Zhiquan: amend commit log and adapt the code ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/main.c | 26 +++++++++++++++++++++++++- arch/x86/kernel/cpu/sgx/sgx.h | 4 +++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 71930c0a5f88..0c230ad410a3 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -61,6 +61,24 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list) page = list_first_entry(dirty_page_list, struct sgx_epc_page, list); + /* + * Checking page->poison without holding the node->lock + * is racy, but losing the race (i.e. poison is set just + * after the check) just means __eremove() will be uselessly + * called for a page that sgx_free_epc_page() will put onto + * the node->sgx_poison_page_list later. + */ + if (page->poison) { + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + struct sgx_numa_node *node = section->node; + + spin_lock(&node->lock); + list_move(&page->list, &node->sgx_poison_page_list); + spin_unlock(&node->lock); + + continue; + } + ret = __eremove(sgx_get_epc_virt_addr(page)); if (!ret) { /* @@ -626,7 +644,11 @@ void sgx_free_epc_page(struct sgx_epc_page *page) spin_lock(&node->lock); - list_add_tail(&page->list, &node->free_page_list); + page->owner = NULL; + if (page->poison) + list_add(&page->list, &node->sgx_poison_page_list); + else + list_add_tail(&page->list, &node->free_page_list); page->flags = SGX_EPC_PAGE_IS_FREE; spin_unlock(&node->lock); @@ -658,6 +680,7 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, section->pages[i].section = index; section->pages[i].flags = 0; section->pages[i].owner = NULL; + section->pages[i].poison = 0; list_add_tail(§ion->pages[i].list, &sgx_dirty_page_list); } @@ -724,6 +747,7 @@ static bool __init sgx_page_cache_init(void) if (!node_isset(nid, sgx_numa_mask)) { spin_lock_init(&sgx_numa_nodes[nid].lock); INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list); + INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list); node_set(nid, sgx_numa_mask); } diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 5906471156c5..9ec3136c7800 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -31,7 +31,8 @@ struct sgx_epc_page { unsigned int section; - unsigned int flags; + u16 flags; + u16 poison; struct sgx_encl_page *owner; struct list_head list; }; @@ -42,6 +43,7 @@ struct sgx_epc_page { */ struct sgx_numa_node { struct list_head free_page_list; + struct list_head sgx_poison_page_list; spinlock_t lock; }; -- Gitee From 968c8d45d9143675fc81fbddf25d52be07c26b2b Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 26 Oct 2021 15:00:47 -0700 Subject: [PATCH 065/134] x86/sgx: Add SGX infrastructure to recover from poison commit a495cbdffa30558b34f3c95555cecc4fd9688039 upstream. Provide a recovery function sgx_memory_failure(). If the poison was consumed synchronously then send a SIGBUS. Note that the virtual address of the access is not included with the SIGBUS as is the case for poison outside of SGX enclaves. This doesn't matter as addresses of code/data inside an enclave is of little to no use to code executing outside the (now dead) enclave. 
Poison found in a free page results in the page being moved from the free list to the per-node poison page list. Intel-SIG: commit a495cbdffa30 x86/sgx: Add SGX infrastructure to recover from poison. Backport for SGX MCA recovery co-existence support. Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Reinette Chatre Link: https://lkml.kernel.org/r/20211026220050.697075-5-tony.luck@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/main.c | 76 ++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 0c230ad410a3..89802fd3cbb8 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -693,6 +693,82 @@ bool arch_is_platform_page(u64 paddr) } EXPORT_SYMBOL_GPL(arch_is_platform_page); +static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr) +{ + struct sgx_epc_section *section; + + section = xa_load(&sgx_epc_address_space, paddr); + if (!section) + return NULL; + + return §ion->pages[PFN_DOWN(paddr - section->phys_addr)]; +} + +/* + * Called in process context to handle a hardware reported + * error in an SGX EPC page. + * If the MF_ACTION_REQUIRED bit is set in flags, then the + * context is the task that consumed the poison data. Otherwise + * this is called from a kernel thread unrelated to the page. + */ +int arch_memory_failure(unsigned long pfn, int flags) +{ + struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT); + struct sgx_epc_section *section; + struct sgx_numa_node *node; + + /* + * mm/memory-failure.c calls this routine for all errors + * where there isn't a "struct page" for the address. But that + * includes other address ranges besides SGX. + */ + if (!page) + return -ENXIO; + + /* + * If poison was consumed synchronously. Send a SIGBUS to + * the task. Hardware has already exited the SGX enclave and + * will not allow re-entry to an enclave that has a memory + * error. The signal may help the task understand why the + * enclave is broken. + */ + if (flags & MF_ACTION_REQUIRED) + force_sig(SIGBUS); + + section = &sgx_epc_sections[page->section]; + node = section->node; + + spin_lock(&node->lock); + + /* Already poisoned? Nothing more to do */ + if (page->poison) + goto out; + + page->poison = 1; + + /* + * If the page is on a free list, move it to the per-node + * poison page list. + */ + if (page->flags & SGX_EPC_PAGE_IS_FREE) { + list_move(&page->list, &node->sgx_poison_page_list); + goto out; + } + + /* + * TBD: Add additional plumbing to enable pre-emptive + * action for asynchronous poison notification. Until + * then just hope that the poison: + * a) is not accessed - sgx_free_epc_page() will deal with it + * when the user gives it back + * b) results in a recoverable machine check rather than + * a fatal one + */ +out: + spin_unlock(&node->lock); + return 0; +} + /** * A section metric is concatenated in a way that @low bits 12-31 define the * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the -- Gitee From 97297db6d4530208d5623e41ac8f5c154805525c Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 26 Oct 2021 15:00:48 -0700 Subject: [PATCH 066/134] x86/sgx: Hook arch_memory_failure() into mainline code commit 03b122da74b22fbe7cd98184fa5657a9ce13970c upstream. Add a call inside memory_failure() to call the arch specific code to check if the address is an SGX EPC page and handle it. 
Note the SGX EPC pages do not have a "struct page" entry, so the hook goes in at the same point as the device mapping hook. Pull the call to acquire the mutex earlier so the SGX errors are also protected. Make set_mce_nospec() skip SGX pages when trying to adjust the 1:1 map. Intel-SIG: commit 03b122da74b2 x86/sgx: Hook arch_memory_failure() into mainline code. Backport for SGX MCA recovery co-existence support. Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Reviewed-by: Naoya Horiguchi Tested-by: Reinette Chatre Link: https://lkml.kernel.org/r/20211026220050.697075-6-tony.luck@intel.com [ Zhiquan Li: amend commit log and resolve the conflict. ] Signed-off-by: Zhiquan Li --- arch/x86/include/asm/processor.h | 8 ++++++++ arch/x86/include/asm/set_memory.h | 4 ++++ include/linux/mm.h | 13 +++++++++++++ mm/memory-failure.c | 19 +++++++++++++------ 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index e125879d8b3e..888b6c0a45a2 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -932,4 +932,12 @@ enum taa_mitigations { TAA_MITIGATION_TSX_DISABLED, }; +#ifdef CONFIG_X86_SGX +int arch_memory_failure(unsigned long pfn, int flags); +#define arch_memory_failure arch_memory_failure + +bool arch_is_platform_page(u64 paddr); +#define arch_is_platform_page arch_is_platform_page +#endif + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 162128cdfbf2..93bcd9a83574 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_SET_MEMORY_H #define _ASM_X86_SET_MEMORY_H +#include #include #include @@ -96,6 +97,9 @@ static inline int set_mce_nospec(unsigned long pfn, bool unmap) unsigned long decoy_addr; int rc; + /* SGX pages are not in the 1:1 map */ + if (arch_is_platform_page(pfn << PAGE_SHIFT)) + return 0; /* * We would like to just call: * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1); diff --git a/include/linux/mm.h b/include/linux/mm.h index cafada5bd593..a1d74d4245a0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2875,6 +2875,19 @@ extern void shake_page(struct page *p, int access); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(struct page *page, int flags); +#ifndef arch_memory_failure +static inline int arch_memory_failure(unsigned long pfn, int flags) +{ + return -ENXIO; +} +#endif + +#ifndef arch_is_platform_page +static inline bool arch_is_platform_page(u64 paddr) +{ + return false; +} +#endif /* * Error handlers for various types of pages. 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e715dd67ce05..73ebf40fc3c9 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1420,21 +1420,28 @@ int memory_failure(unsigned long pfn, int flags) if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); + mutex_lock(&mf_mutex); + p = pfn_to_online_page(pfn); if (!p) { + res = arch_memory_failure(pfn, flags); + if (res == 0) + goto unlock_mutex; + if (pfn_valid(pfn)) { pgmap = get_dev_pagemap(pfn, NULL); - if (pgmap) - return memory_failure_dev_pagemap(pfn, flags, - pgmap); + if (pgmap) { + res = memory_failure_dev_pagemap(pfn, flags, + pgmap); + goto unlock_mutex; + } } pr_err("Memory failure: %#lx: memory outside kernel control\n", pfn); - return -ENXIO; + res = -ENXIO; + goto unlock_mutex; } - mutex_lock(&mf_mutex); - if (PageHuge(p)) { res = memory_failure_hugetlb(pfn, flags); goto unlock_mutex; -- Gitee From cd5909e9ef211a84295d2c790e05807da49093b9 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 26 Oct 2021 15:00:49 -0700 Subject: [PATCH 067/134] x86/sgx: Add hook to error injection address validation commit c6acb1e7bf4656b9434335c72b8245cc84575fde upstream. SGX reserved memory does not appear in the standard address maps. Add hook to call into the SGX code to check if an address is located in SGX memory. There are other challenges in injecting errors into SGX. Update the documentation with a sequence of operations to inject. Intel-SIG: commit c6acb1e7bf46 x86/sgx: Add hook to error injection address validation. Backport for SGX MCA recovery co-existence support. Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Reinette Chatre Link: https://lkml.kernel.org/r/20211026220050.697075-7-tony.luck@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- .../firmware-guide/acpi/apei/einj.rst | 19 +++++++++++++++++++ drivers/acpi/apei/einj.c | 3 ++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/Documentation/firmware-guide/acpi/apei/einj.rst b/Documentation/firmware-guide/acpi/apei/einj.rst index e588bccf5158..2fa989c0a12d 100644 --- a/Documentation/firmware-guide/acpi/apei/einj.rst +++ b/Documentation/firmware-guide/acpi/apei/einj.rst @@ -181,5 +181,24 @@ You should see something like this in dmesg:: [22715.834759] EDAC sbridge MC3: PROCESSOR 0:306e7 TIME 1422553404 SOCKET 0 APIC 0 [22716.616173] EDAC MC3: 1 CE memory read error on CPU_SrcID#0_Channel#0_DIMM#0 (channel:0 slot:0 page:0x12345 offset:0x0 grain:32 syndrome:0x0 - area:DRAM err_code:0001:0090 socket:0 channel_mask:1 rank:0) +Special notes for injection into SGX enclaves: + +There may be a separate BIOS setup option to enable SGX injection. + +The injection process consists of setting some special memory controller +trigger that will inject the error on the next write to the target +address. But the h/w prevents any software outside of an SGX enclave +from accessing enclave pages (even BIOS SMM mode). + +The following sequence can be used: + 1) Determine physical address of enclave page + 2) Use "notrigger=1" mode to inject (this will setup + the injection address, but will not actually inject) + 3) Enter the enclave + 4) Store data to the virtual address matching physical address from step 1 + 5) Execute CLFLUSH for that virtual address + 6) Spin delay for 250ms + 7) Read from the virtual address. This will trigger the error + For more information about EINJ, please refer to ACPI specification version 4.0, section 17.5 and ACPI 5.0, section 18.6. 
diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c index e430cf4caec2..b2948f1f0d58 100644 --- a/drivers/acpi/apei/einj.c +++ b/drivers/acpi/apei/einj.c @@ -544,7 +544,8 @@ static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, ((region_intersects(base_addr, size, IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE) != REGION_INTERSECTS) && (region_intersects(base_addr, size, IORESOURCE_MEM, IORES_DESC_PERSISTENT_MEMORY) - != REGION_INTERSECTS))) + != REGION_INTERSECTS) && + !arch_is_platform_page(base_addr))) return -EINVAL; inject: -- Gitee From 33adfddadf3a86aed1ddf61d3bb2de1187414b0a Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 26 Oct 2021 15:00:50 -0700 Subject: [PATCH 068/134] x86/sgx: Add check for SGX pages to ghes_do_memory_failure() commit 3ad6fd77a2d62e8f4465b429b65805eaf88e1b9e upstream. SGX EPC pages do not have a "struct page" associated with them so the pfn_valid() sanity check fails and results in a warning message to the console. Add an additional check to skip the warning if the address of the error is in an SGX EPC page. Intel-SIG: commit 3ad6fd77a2d6 x86/sgx: Add check for SGX pages to ghes_do_memory_failure(). Backport for SGX MCA recovery co-existence support. Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Reinette Chatre Link: https://lkml.kernel.org/r/20211026220050.697075-8-tony.luck@intel.com [ Zhiquan Li: amend commit log and resolve the conflict. ] Signed-off-by: Zhiquan Li --- drivers/acpi/apei/ghes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 4adf9ef2b0e2..90b867714403 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -450,7 +450,7 @@ static bool ghes_handle_memory_failure(struct ghes *ghes, return false; pfn = mem_err->physical_addr >> PAGE_SHIFT; - if (!pfn_valid(pfn)) { + if (!pfn_valid(pfn) && !arch_is_platform_page(mem_err->physical_addr)) { pr_warn_ratelimited(FW_WARN GHES_PFX "Invalid address in generic error data: %#llx\n", mem_err->physical_addr); -- Gitee From 2c52a99be91187b02bc72cc284249addce8a196a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 15 Nov 2021 10:35:14 -0800 Subject: [PATCH 069/134] selftests/sgx: Fix a benign linker warning commit 5064343fb155487362708bacc8c6ab9dc2c52bb8 upstream. The enclave binary (test_encl.elf) is built with only three sections (tcs, text, and data) as controlled by its custom linker script. If gcc is built with "--enable-linker-build-id" (this appears to be a common configuration even if it is by default off) then gcc will pass "--build-id" to the linker that will prompt it (the linker) to write unique bits identifying the linked file to a ".note.gnu.build-id" section. The section ".note.gnu.build-id" does not exist in the test enclave resulting in the following warning emitted by the linker: /usr/bin/ld: warning: .note.gnu.build-id section discarded, --build-id ignored The test enclave does not use the build id within the binary so fix the warning by passing a build id of "none" to the linker that will disable the setting from any earlier "--build-id" options and thus disable the attempt to write the build id to a ".note.gnu.build-id" section that does not exist. Intel-SIG: commit 5064343fb155 selftests/sgx: Fix a benign linker warning. Backport for SGX EDMM support. 
Link: https://lore.kernel.org/linux-sgx/20191017030340.18301-2-sean.j.christopherson@intel.com/ Suggested-by: Cedric Xing Signed-off-by: Sean Christopherson Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/ca0f8a81fc1e78af9bdbc6a88e0f9c37d82e53f2.1636997631.git.reinette.chatre@intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/sgx/Makefile b/tools/testing/selftests/sgx/Makefile index 7f12d55b97f8..2956584e1e37 100644 --- a/tools/testing/selftests/sgx/Makefile +++ b/tools/testing/selftests/sgx/Makefile @@ -45,7 +45,7 @@ $(OUTPUT)/sign_key.o: sign_key.S $(CC) $(HOST_CFLAGS) -c $< -o $@ $(OUTPUT)/test_encl.elf: test_encl.lds test_encl.c test_encl_bootstrap.S - $(CC) $(ENCL_CFLAGS) -T $^ -o $@ + $(CC) $(ENCL_CFLAGS) -T $^ -o $@ -Wl,--build-id=none EXTRA_CLEAN := \ $(OUTPUT)/test_encl.elf \ -- Gitee From 4cb0bfe6409bdd5fcc934f28bbffe36b8566173a Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Mon, 15 Nov 2021 10:35:15 -0800 Subject: [PATCH 070/134] selftests/sgx: Assign source for each segment commit 39f62536be2f6160bba7294b5208e240d34703c3 upstream. Define source per segment so that enclave pages can be added from different sources, e.g. anonymous VMA for zero pages. In other words, add 'src' field to struct encl_segment, and assign it to 'encl->src' for pages inherited from the enclave binary. Intel-SIG: commit 39f62536be2f selftests/sgx: Assign source for each segment. Backport for SGX EDMM support. Signed-off-by: Jarkko Sakkinen Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/7850709c3089fe20e4bcecb8295ba87c54cc2b4a.1636997631.git.reinette.chatre@intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/load.c | 5 +++-- tools/testing/selftests/sgx/main.h | 1 + tools/testing/selftests/sgx/sigstruct.c | 8 ++++---- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/sgx/load.c b/tools/testing/selftests/sgx/load.c index 3ebe5d1fe337..5605474aab73 100644 --- a/tools/testing/selftests/sgx/load.c +++ b/tools/testing/selftests/sgx/load.c @@ -107,7 +107,7 @@ static bool encl_ioc_add_pages(struct encl *encl, struct encl_segment *seg) memset(&secinfo, 0, sizeof(secinfo)); secinfo.flags = seg->flags; - ioc.src = (uint64_t)encl->src + seg->offset; + ioc.src = (uint64_t)seg->src; ioc.offset = seg->offset; ioc.length = seg->size; ioc.secinfo = (unsigned long)&secinfo; @@ -216,6 +216,7 @@ bool encl_load(const char *path, struct encl *encl) if (j == 0) { src_offset = phdr->p_offset & PAGE_MASK; + encl->src = encl->bin + src_offset; seg->prot = PROT_READ | PROT_WRITE; seg->flags = SGX_PAGE_TYPE_TCS << 8; @@ -228,13 +229,13 @@ bool encl_load(const char *path, struct encl *encl) seg->offset = (phdr->p_offset & PAGE_MASK) - src_offset; seg->size = (phdr->p_filesz + PAGE_SIZE - 1) & PAGE_MASK; + seg->src = encl->src + seg->offset; j++; } assert(j == encl->nr_segments); - encl->src = encl->bin + src_offset; encl->src_size = encl->segment_tbl[j - 1].offset + encl->segment_tbl[j - 1].size; diff --git a/tools/testing/selftests/sgx/main.h b/tools/testing/selftests/sgx/main.h index 68672fd86cf9..452d11dc4889 100644 --- a/tools/testing/selftests/sgx/main.h +++ b/tools/testing/selftests/sgx/main.h @@ -7,6 +7,7 @@ #define MAIN_H struct 
encl_segment { + void *src; off_t offset; size_t size; unsigned int prot; diff --git a/tools/testing/selftests/sgx/sigstruct.c b/tools/testing/selftests/sgx/sigstruct.c index 92bbc5a15c39..202a96fd81bf 100644 --- a/tools/testing/selftests/sgx/sigstruct.c +++ b/tools/testing/selftests/sgx/sigstruct.c @@ -289,14 +289,14 @@ static bool mrenclave_eextend(EVP_MD_CTX *ctx, uint64_t offset, static bool mrenclave_segment(EVP_MD_CTX *ctx, struct encl *encl, struct encl_segment *seg) { - uint64_t end = seg->offset + seg->size; + uint64_t end = seg->size; uint64_t offset; - for (offset = seg->offset; offset < end; offset += PAGE_SIZE) { - if (!mrenclave_eadd(ctx, offset, seg->flags)) + for (offset = 0; offset < end; offset += PAGE_SIZE) { + if (!mrenclave_eadd(ctx, seg->offset + offset, seg->flags)) return false; - if (!mrenclave_eextend(ctx, offset, encl->src + offset)) + if (!mrenclave_eextend(ctx, seg->offset + offset, seg->src + offset)) return false; } -- Gitee From 4d032bf3476c054f50d95f9dd413003a68a70847 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Mon, 15 Nov 2021 10:35:16 -0800 Subject: [PATCH 071/134] selftests/sgx: Make data measurement for an enclave segment optional commit 5f0ce664d8c6c160ce4333e809545a8a57fe2baf upstream. For a heap makes sense to leave its contents "unmeasured" in the SGX enclave build process, meaning that they won't contribute to the cryptographic signature (a RSA-3072 signed SHA56 hash) of the enclave. Enclaves are signed blobs where the signature is calculated both from page data and also from "structural properties" of the pages. For instance a page offset of *every* page added to the enclave is hashed. For data, this is optional, not least because hashing a page has a significant contribution to the enclave load time. Thus, where there is no reason to hash, do not. The SGX ioctl interface supports this with SGX_PAGE_MEASURE flag. Only when the flag is *set*, data is measured. Add seg->measure boolean flag to struct encl_segment. Only when the flag is set, include the segment data to the signature (represented by SIGSTRUCT architectural structure). Intel-SIG: commit 5f0ce664d8c6 selftests/sgx: Make data measurement for an enclave segment optional. Backport for SGX EDMM support. 
Signed-off-by: Jarkko Sakkinen Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/625b6fe28fed76275e9238ec4e15ec3c0d87de81.1636997631.git.reinette.chatre@intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/load.c | 6 +++++- tools/testing/selftests/sgx/main.h | 1 + tools/testing/selftests/sgx/sigstruct.c | 6 ++++-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/sgx/load.c b/tools/testing/selftests/sgx/load.c index 5605474aab73..f1be78984c50 100644 --- a/tools/testing/selftests/sgx/load.c +++ b/tools/testing/selftests/sgx/load.c @@ -111,7 +111,10 @@ static bool encl_ioc_add_pages(struct encl *encl, struct encl_segment *seg) ioc.offset = seg->offset; ioc.length = seg->size; ioc.secinfo = (unsigned long)&secinfo; - ioc.flags = SGX_PAGE_MEASURE; + if (seg->measure) + ioc.flags = SGX_PAGE_MEASURE; + else + ioc.flags = 0; rc = ioctl(encl->fd, SGX_IOC_ENCLAVE_ADD_PAGES, &ioc); if (rc < 0) { @@ -230,6 +233,7 @@ bool encl_load(const char *path, struct encl *encl) seg->offset = (phdr->p_offset & PAGE_MASK) - src_offset; seg->size = (phdr->p_filesz + PAGE_SIZE - 1) & PAGE_MASK; seg->src = encl->src + seg->offset; + seg->measure = true; j++; } diff --git a/tools/testing/selftests/sgx/main.h b/tools/testing/selftests/sgx/main.h index 452d11dc4889..aebc69e7cdc8 100644 --- a/tools/testing/selftests/sgx/main.h +++ b/tools/testing/selftests/sgx/main.h @@ -12,6 +12,7 @@ struct encl_segment { size_t size; unsigned int prot; unsigned int flags; + bool measure; }; struct encl { diff --git a/tools/testing/selftests/sgx/sigstruct.c b/tools/testing/selftests/sgx/sigstruct.c index 202a96fd81bf..50c5ab1aa6fa 100644 --- a/tools/testing/selftests/sgx/sigstruct.c +++ b/tools/testing/selftests/sgx/sigstruct.c @@ -296,8 +296,10 @@ static bool mrenclave_segment(EVP_MD_CTX *ctx, struct encl *encl, if (!mrenclave_eadd(ctx, seg->offset + offset, seg->flags)) return false; - if (!mrenclave_eextend(ctx, seg->offset + offset, seg->src + offset)) - return false; + if (seg->measure) { + if (!mrenclave_eextend(ctx, seg->offset + offset, seg->src + offset)) + return false; + } } return true; -- Gitee From baa4e10dd876757f55964c105f609566f6842f80 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Mon, 15 Nov 2021 10:35:17 -0800 Subject: [PATCH 072/134] selftests/sgx: Create a heap for the test enclave commit 3200505d4de6436af799d7be743d9dc87450ee5a upstream. Create a heap for the test enclave, which is allocated from /dev/null, and left unmeasured. This is beneficial by its own because it verifies that an enclave built from multiple choices, works properly. If LSM hooks are added for SGX some day, a multi source enclave has higher probability to trigger bugs on access control checks. The immediate need comes from the need to implement page reclaim tests. In order to trigger the page reclaimer, one can just set the size of the heap to high enough. Intel-SIG: commit 3200505d4de6 selftests/sgx: Create a heap for the test enclave. Backport for SGX EDMM support. 
Signed-off-by: Jarkko Sakkinen Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/e070c5f23578c29608051cab879b1d276963a27a.1636997631.git.reinette.chatre@intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/load.c | 29 ++++++++++++++++++++++------- tools/testing/selftests/sgx/main.c | 2 +- tools/testing/selftests/sgx/main.h | 4 +++- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/sgx/load.c b/tools/testing/selftests/sgx/load.c index f1be78984c50..9d4322c946e2 100644 --- a/tools/testing/selftests/sgx/load.c +++ b/tools/testing/selftests/sgx/load.c @@ -21,6 +21,8 @@ void encl_delete(struct encl *encl) { + struct encl_segment *heap_seg = &encl->segment_tbl[encl->nr_segments - 1]; + if (encl->encl_base) munmap((void *)encl->encl_base, encl->encl_size); @@ -30,6 +32,8 @@ void encl_delete(struct encl *encl) if (encl->fd) close(encl->fd); + munmap(heap_seg->src, heap_seg->size); + if (encl->segment_tbl) free(encl->segment_tbl); @@ -125,11 +129,10 @@ static bool encl_ioc_add_pages(struct encl *encl, struct encl_segment *seg) return true; } - - -bool encl_load(const char *path, struct encl *encl) +bool encl_load(const char *path, struct encl *encl, unsigned long heap_size) { const char device_path[] = "/dev/sgx_enclave"; + struct encl_segment *seg; Elf64_Phdr *phdr_tbl; off_t src_offset; Elf64_Ehdr *ehdr; @@ -181,6 +184,8 @@ bool encl_load(const char *path, struct encl *encl) ehdr = encl->bin; phdr_tbl = encl->bin + ehdr->e_phoff; + encl->nr_segments = 1; /* one for the heap */ + for (i = 0; i < ehdr->e_phnum; i++) { Elf64_Phdr *phdr = &phdr_tbl[i]; @@ -196,7 +201,6 @@ bool encl_load(const char *path, struct encl *encl) for (i = 0, j = 0; i < ehdr->e_phnum; i++) { Elf64_Phdr *phdr = &phdr_tbl[i]; unsigned int flags = phdr->p_flags; - struct encl_segment *seg; if (phdr->p_type != PT_LOAD) continue; @@ -238,10 +242,21 @@ bool encl_load(const char *path, struct encl *encl) j++; } - assert(j == encl->nr_segments); + assert(j == encl->nr_segments - 1); + + seg = &encl->segment_tbl[j]; + seg->offset = encl->segment_tbl[j - 1].offset + encl->segment_tbl[j - 1].size; + seg->size = heap_size; + seg->src = mmap(NULL, heap_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + seg->prot = PROT_READ | PROT_WRITE; + seg->flags = (SGX_PAGE_TYPE_REG << 8) | seg->prot; + seg->measure = false; + + if (seg->src == MAP_FAILED) + goto err; - encl->src_size = encl->segment_tbl[j - 1].offset + - encl->segment_tbl[j - 1].size; + encl->src_size = encl->segment_tbl[j].offset + encl->segment_tbl[j].size; for (encl->encl_size = 4096; encl->encl_size < encl->src_size; ) encl->encl_size <<= 1; diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index e252015e0c15..6858a35fed20 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -122,7 +122,7 @@ FIXTURE_SETUP(enclave) unsigned int i; void *addr; - if (!encl_load("test_encl.elf", &self->encl)) { + if (!encl_load("test_encl.elf", &self->encl, ENCL_HEAP_SIZE_DEFAULT)) { encl_delete(&self->encl); ksft_exit_skip("cannot load enclaves\n"); } diff --git a/tools/testing/selftests/sgx/main.h b/tools/testing/selftests/sgx/main.h index aebc69e7cdc8..b45c52ec7ab3 100644 --- a/tools/testing/selftests/sgx/main.h +++ b/tools/testing/selftests/sgx/main.h @@ -6,6 +6,8 @@ #ifndef MAIN_H #define MAIN_H +#define ENCL_HEAP_SIZE_DEFAULT 4096 + struct encl_segment { 
void *src; off_t offset; @@ -33,7 +35,7 @@ extern unsigned char sign_key[]; extern unsigned char sign_key_end[]; void encl_delete(struct encl *ctx); -bool encl_load(const char *path, struct encl *encl); +bool encl_load(const char *path, struct encl *encl, unsigned long heap_size); bool encl_measure(struct encl *encl); bool encl_build(struct encl *encl); -- Gitee From 53d659114c83557daa3edf9d3df164a54edb78d8 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Mon, 15 Nov 2021 10:35:18 -0800 Subject: [PATCH 073/134] selftests/sgx: Dump segments and /proc/self/maps only on failure commit 1471721489090515f9f0f059b25124898928e559 upstream. Logging is always a compromise between clarity and detail. The main use case for dumping VMA's is when FIXTURE_SETUP() fails, and is less important for enclaves that do initialize correctly. Therefore, print the segments and /proc/self/maps only in the error case. Finally, if a single test ever creates multiple enclaves, the amount of log lines would become enormous. Intel-SIG: commit 147172148909 selftests/sgx: Dump segments and /proc/self/maps only on failure. Backport for SGX EDMM support. Signed-off-by: Jarkko Sakkinen Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/23cef0ae1de3a8a74cbfbbe74eca48ca3f300fde.1636997631.git.reinette.chatre@intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/main.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 6858a35fed20..deab02f2f3ce 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -127,12 +127,6 @@ FIXTURE_SETUP(enclave) ksft_exit_skip("cannot load enclaves\n"); } - for (i = 0; i < self->encl.nr_segments; i++) { - seg = &self->encl.segment_tbl[i]; - - TH_LOG("0x%016lx 0x%016lx 0x%02x", seg->offset, seg->size, seg->prot); - } - if (!encl_measure(&self->encl)) goto err; @@ -169,6 +163,17 @@ FIXTURE_SETUP(enclave) memset(&self->run, 0, sizeof(self->run)); self->run.tcs = self->encl.encl_base; + return; + +err: + encl_delete(&self->encl); + + for (i = 0; i < self->encl.nr_segments; i++) { + seg = &self->encl.segment_tbl[i]; + + TH_LOG("0x%016lx 0x%016lx 0x%02x", seg->offset, seg->size, seg->prot); + } + maps_file = fopen("/proc/self/maps", "r"); if (maps_file != NULL) { while (fgets(maps_line, sizeof(maps_line), maps_file) != NULL) { @@ -181,11 +186,7 @@ FIXTURE_SETUP(enclave) fclose(maps_file); } -err: - if (!sgx_enter_enclave_sym) - encl_delete(&self->encl); - - ASSERT_NE(sgx_enter_enclave_sym, NULL); + ASSERT_TRUE(false); } FIXTURE_TEARDOWN(enclave) -- Gitee From 2b60bc7c18ada247446b22d7470e6c657fd6c893 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Mon, 15 Nov 2021 10:35:19 -0800 Subject: [PATCH 074/134] selftests/sgx: Encpsulate the test enclave creation commit 1b35eb719549ab5143d61f9e09b0771cd3d00d94 upstream. Introduce setup_test_encl() so that the enclave creation can be moved to TEST_F()'s. This is required for a reclaimer test where the heap size needs to be set large enough to triger the page reclaimer. Intel-SIG: commit 1b35eb719549 selftests/sgx: Encpsulate the test enclave creation. Backport for SGX EDMM support. 
Signed-off-by: Jarkko Sakkinen Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/bee0ca867a95828a569c1ba2a8e443a44047dc71.1636997631.git.reinette.chatre@intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/main.c | 44 ++++++++++++++++++------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index deab02f2f3ce..5b3e49a36344 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -112,7 +112,8 @@ FIXTURE(enclave) { struct sgx_enclave_run run; }; -FIXTURE_SETUP(enclave) +static bool setup_test_encl(unsigned long heap_size, struct encl *encl, + struct __test_metadata *_metadata) { Elf64_Sym *sgx_enter_enclave_sym = NULL; struct vdso_symtab symtab; @@ -122,25 +123,25 @@ FIXTURE_SETUP(enclave) unsigned int i; void *addr; - if (!encl_load("test_encl.elf", &self->encl, ENCL_HEAP_SIZE_DEFAULT)) { - encl_delete(&self->encl); - ksft_exit_skip("cannot load enclaves\n"); + if (!encl_load("test_encl.elf", encl, heap_size)) { + encl_delete(encl); + TH_LOG("Failed to load the test enclave.\n"); } - if (!encl_measure(&self->encl)) + if (!encl_measure(encl)) goto err; - if (!encl_build(&self->encl)) + if (!encl_build(encl)) goto err; /* * An enclave consumer only must do this. */ - for (i = 0; i < self->encl.nr_segments; i++) { - struct encl_segment *seg = &self->encl.segment_tbl[i]; + for (i = 0; i < encl->nr_segments; i++) { + struct encl_segment *seg = &encl->segment_tbl[i]; - addr = mmap((void *)self->encl.encl_base + seg->offset, seg->size, - seg->prot, MAP_SHARED | MAP_FIXED, self->encl.fd, 0); + addr = mmap((void *)encl->encl_base + seg->offset, seg->size, + seg->prot, MAP_SHARED | MAP_FIXED, encl->fd, 0); EXPECT_NE(addr, MAP_FAILED); if (addr == MAP_FAILED) goto err; @@ -160,16 +161,13 @@ FIXTURE_SETUP(enclave) vdso_sgx_enter_enclave = addr + sgx_enter_enclave_sym->st_value; - memset(&self->run, 0, sizeof(self->run)); - self->run.tcs = self->encl.encl_base; - - return; + return true; err: - encl_delete(&self->encl); + encl_delete(encl); - for (i = 0; i < self->encl.nr_segments; i++) { - seg = &self->encl.segment_tbl[i]; + for (i = 0; i < encl->nr_segments; i++) { + seg = &encl->segment_tbl[i]; TH_LOG("0x%016lx 0x%016lx 0x%02x", seg->offset, seg->size, seg->prot); } @@ -186,7 +184,17 @@ FIXTURE_SETUP(enclave) fclose(maps_file); } - ASSERT_TRUE(false); + TH_LOG("Failed to initialize the test enclave.\n"); + + return false; +} + +FIXTURE_SETUP(enclave) +{ + ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); + + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; } FIXTURE_TEARDOWN(enclave) -- Gitee From abd31055591a59329112b868e5fd2986362f9618 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Mon, 15 Nov 2021 10:35:20 -0800 Subject: [PATCH 075/134] selftests/sgx: Move setup_test_encl() to each TEST_F() commit 065825db1fd60aa7695565613a69ed086a831869 upstream. Create the test enclave inside each TEST_F(), instead of FIXTURE_SETUP(), so that the heap size can be defined per test. Intel-SIG: commit 065825db1fd6 selftests/sgx: Move setup_test_encl() to each TEST_F(). Backport for SGX EDMM support. 
Signed-off-by: Jarkko Sakkinen Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/70ca264535d2ca0dc8dcaf2281e7d6965f8d4a24.1636997631.git.reinette.chatre@intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/main.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 5b3e49a36344..f41fba919d06 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -191,10 +191,6 @@ static bool setup_test_encl(unsigned long heap_size, struct encl *encl, FIXTURE_SETUP(enclave) { - ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); - - memset(&self->run, 0, sizeof(self->run)); - self->run.tcs = self->encl.encl_base; } FIXTURE_TEARDOWN(enclave) @@ -226,6 +222,11 @@ TEST_F(enclave, unclobbered_vdso) { struct encl_op op; + ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); + + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; + op.type = ENCL_OP_PUT; op.buffer = MAGIC; @@ -248,6 +249,11 @@ TEST_F(enclave, clobbered_vdso) { struct encl_op op; + ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); + + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; + op.type = ENCL_OP_PUT; op.buffer = MAGIC; @@ -278,6 +284,11 @@ TEST_F(enclave, clobbered_vdso_and_user_function) { struct encl_op op; + ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); + + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; + self->run.user_handler = (__u64)test_handler; self->run.user_data = 0xdeadbeef; -- Gitee From 18f812d14154425fd2523b411177881a1df4b9e1 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Mon, 15 Nov 2021 10:35:21 -0800 Subject: [PATCH 076/134] selftests/sgx: Add a new kselftest: Unclobbered_vdso_oversubscribed commit f0ff2447b8613b883f41ae845b6cc7540d6e5f71 upstream. Add a variation of the unclobbered_vdso test. In the new test, create a heap for the test enclave, which has the same size as all available Enclave Page Cache (EPC) pages in the system. This will guarantee that all test_encl.elf pages *and* SGX Enclave Control Structure (SECS) have been swapped out by the page reclaimer during the load time. This test will trigger both the page reclaimer and the page fault handler. The page reclaimer triggered, while the heap is being created during the load time. The page fault handler is triggered for all the required pages, while the test case is executing. Intel-SIG: commit f0ff2447b861 selftests/sgx: Add a new kselftest: Unclobbered_vdso_oversubscribed. Backport for SGX EDMM support. 
Signed-off-by: Jarkko Sakkinen Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/41f7c508eea79a3198b5014d7691903be08f9ff1.1636997631.git.reinette.chatre@intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/main.c | 75 ++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index f41fba919d06..ee8139a22a3c 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -245,6 +245,81 @@ TEST_F(enclave, unclobbered_vdso) EXPECT_EQ(self->run.user_data, 0); } +/* + * A section metric is concatenated in a way that @low bits 12-31 define the + * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the + * metric. + */ +static unsigned long sgx_calc_section_metric(unsigned int low, + unsigned int high) +{ + return (low & GENMASK_ULL(31, 12)) + + ((high & GENMASK_ULL(19, 0)) << 32); +} + +/* + * Sum total available physical SGX memory across all EPC sections + * + * Return: total available physical SGX memory available on system + */ +static unsigned long get_total_epc_mem(void) +{ + unsigned int eax, ebx, ecx, edx; + unsigned long total_size = 0; + unsigned int type; + int section = 0; + + while (true) { + eax = SGX_CPUID; + ecx = section + SGX_CPUID_EPC; + __cpuid(&eax, &ebx, &ecx, &edx); + + type = eax & SGX_CPUID_EPC_MASK; + if (type == SGX_CPUID_EPC_INVALID) + break; + + if (type != SGX_CPUID_EPC_SECTION) + break; + + total_size += sgx_calc_section_metric(ecx, edx); + + section++; + } + + return total_size; +} + +TEST_F(enclave, unclobbered_vdso_oversubscribed) +{ + unsigned long total_mem; + struct encl_op op; + + total_mem = get_total_epc_mem(); + ASSERT_NE(total_mem, 0); + ASSERT_TRUE(setup_test_encl(total_mem, &self->encl, _metadata)); + + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; + + op.type = ENCL_OP_PUT; + op.buffer = MAGIC; + + EXPECT_EQ(ENCL_CALL(&op, &self->run, false), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.user_data, 0); + + op.type = ENCL_OP_GET; + op.buffer = 0; + + EXPECT_EQ(ENCL_CALL(&op, &self->run, false), 0); + + EXPECT_EQ(op.buffer, MAGIC); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.user_data, 0); + +} + TEST_F(enclave, clobbered_vdso) { struct encl_op op; -- Gitee From bf88d59fce8375476e4109d411c569c812514c6a Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Sat, 4 Dec 2021 22:23:55 +0200 Subject: [PATCH 077/134] selftests/sgx: Fix corrupted cpuid macro invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 572a0a647b9b491729d24c083c8410c55bf16326 upstream. The SGX selftest fails to build on tip/x86/sgx: main.c: In function ‘get_total_epc_mem’: main.c:296:17: error: implicit declaration of function ‘__cpuid’ [-Werror=implicit-function-declaration] 296 | __cpuid(&eax, &ebx, &ecx, &edx); | ^~~~~~~ Include cpuid.h and use __cpuid_count() macro in order to fix the compilation issue. [ dhansen: tweak commit message ] Intel-SIG: commit 572a0a647b9b selftests/sgx: Fix corrupted cpuid macro invocation. Backport for SGX EDMM support. 
Fixes: f0ff2447b861 ("selftests/sgx: Add a new kselftest: Unclobbered_vdso_oversubscribed") Signed-off-by: Jarkko Sakkinen Signed-off-by: Dave Hansen Acked-by: Reinette Chatre Link: https://lkml.kernel.org/r/20211204202355.23005-1-jarkko@kernel.org Cc: Shuah Khan [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index ee8139a22a3c..1ddd513a8cff 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2016-20 Intel Corporation. */ +#include #include #include #include @@ -270,9 +271,7 @@ static unsigned long get_total_epc_mem(void) int section = 0; while (true) { - eax = SGX_CPUID; - ecx = section + SGX_CPUID_EPC; - __cpuid(&eax, &ebx, &ecx, &edx); + __cpuid_count(SGX_CPUID, section + SGX_CPUID_EPC, eax, ebx, ecx, edx); type = eax & SGX_CPUID_EPC_MASK; if (type == SGX_CPUID_EPC_INVALID) -- Gitee From b35653ebf0cda12418a1a6e57187141573dec347 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Mon, 15 Nov 2021 10:35:22 -0800 Subject: [PATCH 078/134] selftests/sgx: Provide per-op parameter structs for the test enclave commit 41493a095e487c207b4b702aee2f8c59a7294e4f upstream. To add more operations to the test enclave, the protocol needs to allow to have operations with varying parameters. Create a separate parameter struct for each existing operation, with the shared parameters in struct encl_op_header. Intel-SIG: commit 41493a095e48 selftests/sgx: Provide per-op parameter structs for the test enclave. Backport for SGX EDMM support. [reinette: rebased to apply on top of oversubscription test series] Signed-off-by: Jarkko Sakkinen Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/f9a4a8c436b538003b8ebddaa66083992053cef1.1636997631.git.reinette.chatre@intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/defines.h | 14 ++++- tools/testing/selftests/sgx/main.c | 68 +++++++++++++------------ tools/testing/selftests/sgx/test_encl.c | 33 +++++++----- 3 files changed, 69 insertions(+), 46 deletions(-) diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h index f88562afcaa0..6ff95a766287 100644 --- a/tools/testing/selftests/sgx/defines.h +++ b/tools/testing/selftests/sgx/defines.h @@ -21,11 +21,21 @@ enum encl_op_type { ENCL_OP_PUT, ENCL_OP_GET, + ENCL_OP_MAX, }; -struct encl_op { +struct encl_op_header { uint64_t type; - uint64_t buffer; +}; + +struct encl_op_put { + struct encl_op_header header; + uint64_t value; +}; + +struct encl_op_get { + struct encl_op_header header; + uint64_t value; }; #endif /* DEFINES_H */ diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 1ddd513a8cff..f8bf592c5074 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -221,27 +221,28 @@ FIXTURE_TEARDOWN(enclave) TEST_F(enclave, unclobbered_vdso) { - struct encl_op op; + struct encl_op_put put_op; + struct encl_op_get get_op; ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); memset(&self->run, 0, sizeof(self->run)); self->run.tcs = self->encl.encl_base; - op.type = ENCL_OP_PUT; - op.buffer = MAGIC; + put_op.header.type = ENCL_OP_PUT; + put_op.value = MAGIC; - EXPECT_EQ(ENCL_CALL(&op, 
&self->run, false), 0); + EXPECT_EQ(ENCL_CALL(&put_op, &self->run, false), 0); EXPECT_EEXIT(&self->run); EXPECT_EQ(self->run.user_data, 0); - op.type = ENCL_OP_GET; - op.buffer = 0; + get_op.header.type = ENCL_OP_GET; + get_op.value = 0; - EXPECT_EQ(ENCL_CALL(&op, &self->run, false), 0); + EXPECT_EQ(ENCL_CALL(&get_op, &self->run, false), 0); - EXPECT_EQ(op.buffer, MAGIC); + EXPECT_EQ(get_op.value, MAGIC); EXPECT_EEXIT(&self->run); EXPECT_EQ(self->run.user_data, 0); } @@ -291,7 +292,8 @@ static unsigned long get_total_epc_mem(void) TEST_F(enclave, unclobbered_vdso_oversubscribed) { unsigned long total_mem; - struct encl_op op; + struct encl_op_put put_op; + struct encl_op_get get_op; total_mem = get_total_epc_mem(); ASSERT_NE(total_mem, 0); @@ -300,20 +302,20 @@ TEST_F(enclave, unclobbered_vdso_oversubscribed) memset(&self->run, 0, sizeof(self->run)); self->run.tcs = self->encl.encl_base; - op.type = ENCL_OP_PUT; - op.buffer = MAGIC; + put_op.header.type = ENCL_OP_PUT; + put_op.value = MAGIC; - EXPECT_EQ(ENCL_CALL(&op, &self->run, false), 0); + EXPECT_EQ(ENCL_CALL(&put_op, &self->run, false), 0); EXPECT_EEXIT(&self->run); EXPECT_EQ(self->run.user_data, 0); - op.type = ENCL_OP_GET; - op.buffer = 0; + get_op.header.type = ENCL_OP_GET; + get_op.value = 0; - EXPECT_EQ(ENCL_CALL(&op, &self->run, false), 0); + EXPECT_EQ(ENCL_CALL(&get_op, &self->run, false), 0); - EXPECT_EQ(op.buffer, MAGIC); + EXPECT_EQ(get_op.value, MAGIC); EXPECT_EEXIT(&self->run); EXPECT_EQ(self->run.user_data, 0); @@ -321,27 +323,28 @@ TEST_F(enclave, unclobbered_vdso_oversubscribed) TEST_F(enclave, clobbered_vdso) { - struct encl_op op; + struct encl_op_put put_op; + struct encl_op_get get_op; ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); memset(&self->run, 0, sizeof(self->run)); self->run.tcs = self->encl.encl_base; - op.type = ENCL_OP_PUT; - op.buffer = MAGIC; + put_op.header.type = ENCL_OP_PUT; + put_op.value = MAGIC; - EXPECT_EQ(ENCL_CALL(&op, &self->run, true), 0); + EXPECT_EQ(ENCL_CALL(&put_op, &self->run, true), 0); EXPECT_EEXIT(&self->run); EXPECT_EQ(self->run.user_data, 0); - op.type = ENCL_OP_GET; - op.buffer = 0; + get_op.header.type = ENCL_OP_GET; + get_op.value = 0; - EXPECT_EQ(ENCL_CALL(&op, &self->run, true), 0); + EXPECT_EQ(ENCL_CALL(&get_op, &self->run, true), 0); - EXPECT_EQ(op.buffer, MAGIC); + EXPECT_EQ(get_op.value, MAGIC); EXPECT_EEXIT(&self->run); EXPECT_EQ(self->run.user_data, 0); } @@ -356,7 +359,8 @@ static int test_handler(long rdi, long rsi, long rdx, long ursp, long r8, long r TEST_F(enclave, clobbered_vdso_and_user_function) { - struct encl_op op; + struct encl_op_put put_op; + struct encl_op_get get_op; ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); @@ -366,20 +370,20 @@ TEST_F(enclave, clobbered_vdso_and_user_function) self->run.user_handler = (__u64)test_handler; self->run.user_data = 0xdeadbeef; - op.type = ENCL_OP_PUT; - op.buffer = MAGIC; + put_op.header.type = ENCL_OP_PUT; + put_op.value = MAGIC; - EXPECT_EQ(ENCL_CALL(&op, &self->run, true), 0); + EXPECT_EQ(ENCL_CALL(&put_op, &self->run, true), 0); EXPECT_EEXIT(&self->run); EXPECT_EQ(self->run.user_data, 0); - op.type = ENCL_OP_GET; - op.buffer = 0; + get_op.header.type = ENCL_OP_GET; + get_op.value = 0; - EXPECT_EQ(ENCL_CALL(&op, &self->run, true), 0); + EXPECT_EQ(ENCL_CALL(&get_op, &self->run, true), 0); - EXPECT_EQ(op.buffer, MAGIC); + EXPECT_EQ(get_op.value, MAGIC); EXPECT_EEXIT(&self->run); EXPECT_EQ(self->run.user_data, 0); } diff --git 
a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c index 734ea52f9924..f11eb8315704 100644 --- a/tools/testing/selftests/sgx/test_encl.c +++ b/tools/testing/selftests/sgx/test_encl.c @@ -16,20 +16,29 @@ static void *memcpy(void *dest, const void *src, size_t n) return dest; } -void encl_body(void *rdi, void *rsi) +static void do_encl_op_put(void *op) +{ + struct encl_op_put *op2 = op; + + memcpy(&encl_buffer[0], &op2->value, 8); +} + +static void do_encl_op_get(void *op) { - struct encl_op *op = (struct encl_op *)rdi; + struct encl_op_get *op2 = op; - switch (op->type) { - case ENCL_OP_PUT: - memcpy(&encl_buffer[0], &op->buffer, 8); - break; + memcpy(&op2->value, &encl_buffer[0], 8); +} + +void encl_body(void *rdi, void *rsi) +{ + const void (*encl_op_array[ENCL_OP_MAX])(void *) = { + do_encl_op_put, + do_encl_op_get, + }; - case ENCL_OP_GET: - memcpy(&op->buffer, &encl_buffer[0], 8); - break; + struct encl_op_header *op = (struct encl_op_header *)rdi; - default: - break; - } + if (op->type < ENCL_OP_MAX) + (*encl_op_array[op->type])(op); } -- Gitee From 084e7b9e5b1841842c931327a2e32fce7385d0a8 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Mon, 15 Nov 2021 10:35:23 -0800 Subject: [PATCH 079/134] selftests/sgx: Rename test properties in preparation for more enclave tests commit c085dfc7685c8c36698b851b03990b75a3226e97 upstream. SGX selftests prepares a data structure outside of the enclave with the type of and data for the operation that needs to be run within the enclave. At this time only two complementary operations are supported by the enclave: copying a value from outside the enclave into a default buffer within the enclave and reading a value from the enclave's default buffer into a variable accessible outside the enclave. In preparation for more operations supported by the enclave the names of the current enclave operations are changed to more accurately reflect the operations and more easily distinguish it from future operations: * The enums ENCL_OP_PUT and ENCL_OP_GET are renamed to ENCL_OP_PUT_TO_BUFFER and ENCL_OP_GET_FROM_BUFFER respectively. * The structs encl_op_put and encl_op_get are renamed to encl_op_put_to_buf and encl_op_get_from_buf respectively. * The enclave functions do_encl_op_put and do_encl_op_get are renamed to do_encl_op_put_to_buf and do_encl_op_get_from_buf respectively. No functional changes. Intel-SIG: commit c085dfc7685c selftests/sgx: Rename test properties in preparation for more enclave tests. Backport for SGX EDMM support. 
Suggested-by: Jarkko Sakkinen Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Jarkko Sakkinen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/023fda047c787cf330b88ed9337705edae6a0078.1636997631.git.reinette.chatre@intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/defines.h | 8 +++---- tools/testing/selftests/sgx/main.c | 32 ++++++++++++------------- tools/testing/selftests/sgx/test_encl.c | 12 +++++----- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h index 6ff95a766287..9ea0c7882dfb 100644 --- a/tools/testing/selftests/sgx/defines.h +++ b/tools/testing/selftests/sgx/defines.h @@ -19,8 +19,8 @@ #include "../../../../arch/x86/include/uapi/asm/sgx.h" enum encl_op_type { - ENCL_OP_PUT, - ENCL_OP_GET, + ENCL_OP_PUT_TO_BUFFER, + ENCL_OP_GET_FROM_BUFFER, ENCL_OP_MAX, }; @@ -28,12 +28,12 @@ struct encl_op_header { uint64_t type; }; -struct encl_op_put { +struct encl_op_put_to_buf { struct encl_op_header header; uint64_t value; }; -struct encl_op_get { +struct encl_op_get_from_buf { struct encl_op_header header; uint64_t value; }; diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index f8bf592c5074..1ad64fb4cb0d 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -221,15 +221,15 @@ FIXTURE_TEARDOWN(enclave) TEST_F(enclave, unclobbered_vdso) { - struct encl_op_put put_op; - struct encl_op_get get_op; + struct encl_op_get_from_buf get_op; + struct encl_op_put_to_buf put_op; ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); memset(&self->run, 0, sizeof(self->run)); self->run.tcs = self->encl.encl_base; - put_op.header.type = ENCL_OP_PUT; + put_op.header.type = ENCL_OP_PUT_TO_BUFFER; put_op.value = MAGIC; EXPECT_EQ(ENCL_CALL(&put_op, &self->run, false), 0); @@ -237,7 +237,7 @@ TEST_F(enclave, unclobbered_vdso) EXPECT_EEXIT(&self->run); EXPECT_EQ(self->run.user_data, 0); - get_op.header.type = ENCL_OP_GET; + get_op.header.type = ENCL_OP_GET_FROM_BUFFER; get_op.value = 0; EXPECT_EQ(ENCL_CALL(&get_op, &self->run, false), 0); @@ -291,9 +291,9 @@ static unsigned long get_total_epc_mem(void) TEST_F(enclave, unclobbered_vdso_oversubscribed) { + struct encl_op_get_from_buf get_op; + struct encl_op_put_to_buf put_op; unsigned long total_mem; - struct encl_op_put put_op; - struct encl_op_get get_op; total_mem = get_total_epc_mem(); ASSERT_NE(total_mem, 0); @@ -302,7 +302,7 @@ TEST_F(enclave, unclobbered_vdso_oversubscribed) memset(&self->run, 0, sizeof(self->run)); self->run.tcs = self->encl.encl_base; - put_op.header.type = ENCL_OP_PUT; + put_op.header.type = ENCL_OP_PUT_TO_BUFFER; put_op.value = MAGIC; EXPECT_EQ(ENCL_CALL(&put_op, &self->run, false), 0); @@ -310,7 +310,7 @@ TEST_F(enclave, unclobbered_vdso_oversubscribed) EXPECT_EEXIT(&self->run); EXPECT_EQ(self->run.user_data, 0); - get_op.header.type = ENCL_OP_GET; + get_op.header.type = ENCL_OP_GET_FROM_BUFFER; get_op.value = 0; EXPECT_EQ(ENCL_CALL(&get_op, &self->run, false), 0); @@ -323,15 +323,15 @@ TEST_F(enclave, unclobbered_vdso_oversubscribed) TEST_F(enclave, clobbered_vdso) { - struct encl_op_put put_op; - struct encl_op_get get_op; + struct encl_op_get_from_buf get_op; + struct encl_op_put_to_buf put_op; ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); memset(&self->run, 0, sizeof(self->run)); self->run.tcs = self->encl.encl_base; - 
put_op.header.type = ENCL_OP_PUT; + put_op.header.type = ENCL_OP_PUT_TO_BUFFER; put_op.value = MAGIC; EXPECT_EQ(ENCL_CALL(&put_op, &self->run, true), 0); @@ -339,7 +339,7 @@ TEST_F(enclave, clobbered_vdso) EXPECT_EEXIT(&self->run); EXPECT_EQ(self->run.user_data, 0); - get_op.header.type = ENCL_OP_GET; + get_op.header.type = ENCL_OP_GET_FROM_BUFFER; get_op.value = 0; EXPECT_EQ(ENCL_CALL(&get_op, &self->run, true), 0); @@ -359,8 +359,8 @@ static int test_handler(long rdi, long rsi, long rdx, long ursp, long r8, long r TEST_F(enclave, clobbered_vdso_and_user_function) { - struct encl_op_put put_op; - struct encl_op_get get_op; + struct encl_op_get_from_buf get_op; + struct encl_op_put_to_buf put_op; ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); @@ -370,7 +370,7 @@ TEST_F(enclave, clobbered_vdso_and_user_function) self->run.user_handler = (__u64)test_handler; self->run.user_data = 0xdeadbeef; - put_op.header.type = ENCL_OP_PUT; + put_op.header.type = ENCL_OP_PUT_TO_BUFFER; put_op.value = MAGIC; EXPECT_EQ(ENCL_CALL(&put_op, &self->run, true), 0); @@ -378,7 +378,7 @@ TEST_F(enclave, clobbered_vdso_and_user_function) EXPECT_EEXIT(&self->run); EXPECT_EQ(self->run.user_data, 0); - get_op.header.type = ENCL_OP_GET; + get_op.header.type = ENCL_OP_GET_FROM_BUFFER; get_op.value = 0; EXPECT_EQ(ENCL_CALL(&get_op, &self->run, true), 0); diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c index f11eb8315704..4e8da738173f 100644 --- a/tools/testing/selftests/sgx/test_encl.c +++ b/tools/testing/selftests/sgx/test_encl.c @@ -16,16 +16,16 @@ static void *memcpy(void *dest, const void *src, size_t n) return dest; } -static void do_encl_op_put(void *op) +static void do_encl_op_put_to_buf(void *op) { - struct encl_op_put *op2 = op; + struct encl_op_put_to_buf *op2 = op; memcpy(&encl_buffer[0], &op2->value, 8); } -static void do_encl_op_get(void *op) +static void do_encl_op_get_from_buf(void *op) { - struct encl_op_get *op2 = op; + struct encl_op_get_from_buf *op2 = op; memcpy(&op2->value, &encl_buffer[0], 8); } @@ -33,8 +33,8 @@ static void do_encl_op_get(void *op) void encl_body(void *rdi, void *rsi) { const void (*encl_op_array[ENCL_OP_MAX])(void *) = { - do_encl_op_put, - do_encl_op_get, + do_encl_op_put_to_buf, + do_encl_op_get_from_buf, }; struct encl_op_header *op = (struct encl_op_header *)rdi; -- Gitee From eaabcea1d4044af7418b4223bf28d7e4cb3c5211 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Mon, 15 Nov 2021 10:35:24 -0800 Subject: [PATCH 080/134] selftests/sgx: Add page permission and exception test commit abc5cec4735080d12d644c2d39f96cf98c0a367c upstream. The Enclave Page Cache Map (EPCM) is a secure structure used by the processor to track the contents of the enclave page cache. The EPCM contains permissions with which enclave pages can be accessed. SGX support allows EPCM and PTE page permissions to differ - as long as the PTE permissions do not exceed the EPCM permissions. Add a test that: (1) Creates an SGX enclave page with writable EPCM permission. (2) Changes the PTE permission on the page to read-only. This should be permitted because the permission does not exceed the EPCM permission. (3) Attempts a write to the page. This should generate a page fault (#PF) because of the read-only PTE even though the EPCM permissions allow the page to be written to. This introduces the first test of SGX exception handling. 
In this test the issue that caused the exception (PTE page permissions) can be fixed from outside the enclave and after doing so it is possible to re-enter enclave at original entrypoint with ERESUME. Intel-SIG: commit abc5cec47350 selftests/sgx: Add page permission and exception test. Backport for SGX EDMM support. Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/3bcc73a4b9fe8780bdb40571805e7ced59e01df7.1636997631.git.reinette.chatre@intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/defines.h | 14 +++ tools/testing/selftests/sgx/main.c | 134 ++++++++++++++++++++++++ tools/testing/selftests/sgx/test_encl.c | 21 ++++ 3 files changed, 169 insertions(+) diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h index 9ea0c7882dfb..0bbda6f0c7d3 100644 --- a/tools/testing/selftests/sgx/defines.h +++ b/tools/testing/selftests/sgx/defines.h @@ -21,6 +21,8 @@ enum encl_op_type { ENCL_OP_PUT_TO_BUFFER, ENCL_OP_GET_FROM_BUFFER, + ENCL_OP_PUT_TO_ADDRESS, + ENCL_OP_GET_FROM_ADDRESS, ENCL_OP_MAX, }; @@ -38,4 +40,16 @@ struct encl_op_get_from_buf { uint64_t value; }; +struct encl_op_put_to_addr { + struct encl_op_header header; + uint64_t value; + uint64_t addr; +}; + +struct encl_op_get_from_addr { + struct encl_op_header header; + uint64_t value; + uint64_t addr; +}; + #endif /* DEFINES_H */ diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 1ad64fb4cb0d..122f56776962 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -22,6 +22,7 @@ #include "main.h" static const uint64_t MAGIC = 0x1122334455667788ULL; +static const uint64_t MAGIC2 = 0x8877665544332211ULL; vdso_sgx_enter_enclave_t vdso_sgx_enter_enclave; struct vdso_symtab { @@ -108,6 +109,25 @@ static Elf64_Sym *vdso_symtab_get(struct vdso_symtab *symtab, const char *name) return NULL; } +/* + * Return the offset in the enclave where the data segment can be found. + * The first RW segment loaded is the TCS, skip that to get info on the + * data segment. + */ +static off_t encl_get_data_offset(struct encl *encl) +{ + int i; + + for (i = 1; i < encl->nr_segments; i++) { + struct encl_segment *seg = &encl->segment_tbl[i]; + + if (seg->prot == (PROT_READ | PROT_WRITE)) + return seg->offset; + } + + return -1; +} + FIXTURE(enclave) { struct encl encl; struct sgx_enclave_run run; @@ -388,4 +408,118 @@ TEST_F(enclave, clobbered_vdso_and_user_function) EXPECT_EQ(self->run.user_data, 0); } +/* + * Second page of .data segment is used to test changing PTE permissions. + * This spans the local encl_buffer within the test enclave. + * + * 1) Start with a sanity check: a value is written to the target page within + * the enclave and read back to ensure target page can be written to. + * 2) Change PTE permissions (RW -> RO) of target page within enclave. + * 3) Repeat (1) - this time expecting a regular #PF communicated via the + * vDSO. + * 4) Change PTE permissions of target page within enclave back to be RW. + * 5) Repeat (1) by resuming enclave, now expected to be possible to write to + * and read from target page within enclave. 
+ */ +TEST_F(enclave, pte_permissions) +{ + struct encl_op_get_from_addr get_addr_op; + struct encl_op_put_to_addr put_addr_op; + unsigned long data_start; + int ret; + + ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); + + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; + + data_start = self->encl.encl_base + + encl_get_data_offset(&self->encl) + + PAGE_SIZE; + + /* + * Sanity check to ensure it is possible to write to page that will + * have its permissions manipulated. + */ + + /* Write MAGIC to page */ + put_addr_op.value = MAGIC; + put_addr_op.addr = data_start; + put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; + + EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + /* + * Read memory that was just written to, confirming that it is the + * value previously written (MAGIC). + */ + get_addr_op.value = 0; + get_addr_op.addr = data_start; + get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; + + EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); + + EXPECT_EQ(get_addr_op.value, MAGIC); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + /* Change PTE permissions of target page within the enclave */ + ret = mprotect((void *)data_start, PAGE_SIZE, PROT_READ); + if (ret) + perror("mprotect"); + + /* + * PTE permissions of target page changed to read-only, EPCM + * permissions unchanged (EPCM permissions are RW), attempt to + * write to the page, expecting a regular #PF. + */ + + put_addr_op.value = MAGIC2; + + EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); + + EXPECT_EQ(self->run.exception_vector, 14); + EXPECT_EQ(self->run.exception_error_code, 0x7); + EXPECT_EQ(self->run.exception_addr, data_start); + + self->run.exception_vector = 0; + self->run.exception_error_code = 0; + self->run.exception_addr = 0; + + /* + * Change PTE permissions back to enable enclave to write to the + * target page and resume enclave - do not expect any exceptions this + * time. + */ + ret = mprotect((void *)data_start, PAGE_SIZE, PROT_READ | PROT_WRITE); + if (ret) + perror("mprotect"); + + EXPECT_EQ(vdso_sgx_enter_enclave((unsigned long)&put_addr_op, 0, + 0, ERESUME, 0, 0, &self->run), + 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + get_addr_op.value = 0; + + EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); + + EXPECT_EQ(get_addr_op.value, MAGIC2); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c index 4e8da738173f..5d86e3e6456a 100644 --- a/tools/testing/selftests/sgx/test_encl.c +++ b/tools/testing/selftests/sgx/test_encl.c @@ -4,6 +4,11 @@ #include #include "defines.h" +/* + * Data buffer spanning two pages that will be placed first in .data + * segment. Even if not used internally the second page is needed by + * external test manipulating page permissions. 
+ */ static uint8_t encl_buffer[8192] = { 1 }; static void *memcpy(void *dest, const void *src, size_t n) @@ -30,11 +35,27 @@ static void do_encl_op_get_from_buf(void *op) memcpy(&op2->value, &encl_buffer[0], 8); } +static void do_encl_op_put_to_addr(void *_op) +{ + struct encl_op_put_to_addr *op = _op; + + memcpy((void *)op->addr, &op->value, 8); +} + +static void do_encl_op_get_from_addr(void *_op) +{ + struct encl_op_get_from_addr *op = _op; + + memcpy(&op->value, (void *)op->addr, 8); +} + void encl_body(void *rdi, void *rsi) { const void (*encl_op_array[ENCL_OP_MAX])(void *) = { do_encl_op_put_to_buf, do_encl_op_get_from_buf, + do_encl_op_put_to_addr, + do_encl_op_get_from_addr, }; struct encl_op_header *op = (struct encl_op_header *)rdi; -- Gitee From f744ababd4b8753e996295e602b65fa9b5bad0dd Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Mon, 15 Nov 2021 10:35:25 -0800 Subject: [PATCH 081/134] selftests/sgx: Enable multiple thread support commit 26e688f1263a3c226f3bb5e3441c310ae11e8001 upstream. Each thread executing in an enclave is associated with a Thread Control Structure (TCS). The test enclave contains two hardcoded TCS. Each TCS contains meta-data used by the hardware to save and restore thread specific information when entering/exiting the enclave. The two TCS structures within the test enclave share their SSA (State Save Area) resulting in the threads clobbering each other's data. Fix this by providing each TCS their own SSA area. Additionally, there is an 8K stack space and its address is computed from the enclave entry point which is correctly done for TCS #1 that starts on the first address inside the enclave but results in out of bounds memory when entering as TCS #2. Split 8K stack space into two separate pages with offset symbol between to ensure the current enclave entry calculation can continue to be used for both threads. While using the enclave with multiple threads requires these fixes the impact is not apparent because every test up to this point enters the enclave from the first TCS. More detail about the stack fix: ------------------------------- Before this change the test enclave (test_encl) looks as follows: .tcs (2 pages): (page 1) TCS #1 (page 2) TCS #2 .text (1 page) One page of code .data (5 pages) (page 1) encl_buffer (page 2) encl_buffer (page 3) SSA (page 4 and 5) STACK encl_stack: As shown above there is a symbol, encl_stack, that points to the end of the .data segment (pointing to the end of page 5 in .data) which is also the end of the enclave. The enclave entry code computes the stack address by adding encl_stack to the pointer to the TCS that entered the enclave. When entering at TCS #1 the stack is computed correctly but when entering at TCS #2 the stack pointer would point to one page beyond the end of the enclave and a #PF would result when TCS #2 attempts to enter the enclave. The fix involves moving the encl_stack symbol between the two stack pages. Doing so enables the stack address computation in the entry code to compute the correct stack address for each TCS. Intel-SIG: commit 26e688f1263a selftests/sgx: Enable multiple thread support. Backport for SGX EDMM support. 
Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/a49dc0d85401db788a0a3f0d795e848abf3b1f44.1636997631.git.reinette.chatre@intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- .../selftests/sgx/test_encl_bootstrap.S | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/sgx/test_encl_bootstrap.S b/tools/testing/selftests/sgx/test_encl_bootstrap.S index 5d5680d4ea39..82fb0dfcbd23 100644 --- a/tools/testing/selftests/sgx/test_encl_bootstrap.S +++ b/tools/testing/selftests/sgx/test_encl_bootstrap.S @@ -12,7 +12,7 @@ .fill 1, 8, 0 # STATE (set by CPU) .fill 1, 8, 0 # FLAGS - .quad encl_ssa # OSSA + .quad encl_ssa_tcs1 # OSSA .fill 1, 4, 0 # CSSA (set by CPU) .fill 1, 4, 1 # NSSA .quad encl_entry # OENTRY @@ -23,10 +23,10 @@ .fill 1, 4, 0xFFFFFFFF # GSLIMIT .fill 4024, 1, 0 # Reserved - # Identical to the previous TCS. + # TCS2 .fill 1, 8, 0 # STATE (set by CPU) .fill 1, 8, 0 # FLAGS - .quad encl_ssa # OSSA + .quad encl_ssa_tcs2 # OSSA .fill 1, 4, 0 # CSSA (set by CPU) .fill 1, 4, 1 # NSSA .quad encl_entry # OENTRY @@ -40,8 +40,9 @@ .text encl_entry: - # RBX contains the base address for TCS, which is also the first address - # inside the enclave. By adding the value of le_stack_end to it, we get + # RBX contains the base address for TCS, which is the first address + # inside the enclave for TCS #1 and one page into the enclave for + # TCS #2. By adding the value of encl_stack to it, we get # the absolute address for the stack. lea (encl_stack)(%rbx), %rax xchg %rsp, %rax @@ -81,9 +82,15 @@ encl_entry: .section ".data", "aw" -encl_ssa: +encl_ssa_tcs1: + .space 4096 +encl_ssa_tcs2: .space 4096 .balign 4096 - .space 8192 + # Stack of TCS #1 + .space 4096 encl_stack: + .balign 4096 + # Stack of TCS #2 + .space 4096 -- Gitee From 074bdc3a274be46bc25ab41e475f62bd56156412 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Mon, 15 Nov 2021 10:35:26 -0800 Subject: [PATCH 082/134] selftests/sgx: Add test for multiple TCS entry commit 688542e29fae655a8be25832f6a9959bdd308dd8 upstream. Each thread executing in an enclave is associated with a Thread Control Structure (TCS). The SGX test enclave contains two hardcoded TCS, thus supporting two threads in the enclave. Add a test to ensure it is possible to enter enclave at both entrypoints. Intel-SIG: commit 688542e29fae selftests/sgx: Add test for multiple TCS entry. Backport for SGX EDMM support. 
Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/7be151a57b4c7959a2364753b995e0006efa3da1.1636997631.git.reinette.chatre@intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/defines.h | 1 + tools/testing/selftests/sgx/main.c | 32 +++++++++++++++++++++++++ tools/testing/selftests/sgx/test_encl.c | 6 +++++ 3 files changed, 39 insertions(+) diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h index 0bbda6f0c7d3..02d775789ea7 100644 --- a/tools/testing/selftests/sgx/defines.h +++ b/tools/testing/selftests/sgx/defines.h @@ -23,6 +23,7 @@ enum encl_op_type { ENCL_OP_GET_FROM_BUFFER, ENCL_OP_PUT_TO_ADDRESS, ENCL_OP_GET_FROM_ADDRESS, + ENCL_OP_NOP, ENCL_OP_MAX, }; diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 122f56776962..370c4995f7c4 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -408,6 +408,38 @@ TEST_F(enclave, clobbered_vdso_and_user_function) EXPECT_EQ(self->run.user_data, 0); } +/* + * Sanity check that it is possible to enter either of the two hardcoded TCS + */ +TEST_F(enclave, tcs_entry) +{ + struct encl_op_header op; + + ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); + + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; + + op.type = ENCL_OP_NOP; + + EXPECT_EQ(ENCL_CALL(&op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + /* Move to the next TCS. */ + self->run.tcs = self->encl.encl_base + PAGE_SIZE; + + EXPECT_EQ(ENCL_CALL(&op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); +} + /* * Second page of .data segment is used to test changing PTE permissions. * This spans the local encl_buffer within the test enclave. diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c index 5d86e3e6456a..4fca01cfd898 100644 --- a/tools/testing/selftests/sgx/test_encl.c +++ b/tools/testing/selftests/sgx/test_encl.c @@ -49,6 +49,11 @@ static void do_encl_op_get_from_addr(void *_op) memcpy(&op->value, (void *)op->addr, 8); } +static void do_encl_op_nop(void *_op) +{ + +} + void encl_body(void *rdi, void *rsi) { const void (*encl_op_array[ENCL_OP_MAX])(void *) = { @@ -56,6 +61,7 @@ void encl_body(void *rdi, void *rsi) do_encl_op_get_from_buf, do_encl_op_put_to_addr, do_encl_op_get_from_addr, + do_encl_op_nop, }; struct encl_op_header *op = (struct encl_op_header *)rdi; -- Gitee From a128273e3ef9782ea45aca54c4ee6f952bd9a365 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 29 Oct 2021 10:49:56 -0700 Subject: [PATCH 083/134] x86/sgx: Fix minor documentation issues commit 379e4de9e140850cf699dd390f21ea4b923c955d upstream. The SGX documentation has a few repeated or one-off issues: * Remove capitalization from regular words in the middle of a sentence. * Remove punctuation found in the middle of a sentence. * Fix name of SGX daemon to consistently be ksgxd. * Fix typo of SGX instruction: ENIT -> EINIT [ dhansen: tweaked subject and changelog ] Intel-SIG: commit 379e4de9e140 x86/sgx: Fix minor documentation issues. Backport for SGX EDMM support. 
Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/ab99a87368eef69e3fb96f073368becff3eff874.1635529506.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- Documentation/x86/sgx.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/x86/sgx.rst b/Documentation/x86/sgx.rst index a608f667fb95..265568a9292c 100644 --- a/Documentation/x86/sgx.rst +++ b/Documentation/x86/sgx.rst @@ -10,7 +10,7 @@ Overview Software Guard eXtensions (SGX) hardware enables for user space applications to set aside private memory regions of code and data: -* Privileged (ring-0) ENCLS functions orchestrate the construction of the. +* Privileged (ring-0) ENCLS functions orchestrate the construction of the regions. * Unprivileged (ring-3) ENCLU functions allow an application to enter and execute inside the regions. @@ -91,7 +91,7 @@ In addition to the traditional compiler and linker build process, SGX has a separate enclave “build” process. Enclaves must be built before they can be executed (entered). The first step in building an enclave is opening the **/dev/sgx_enclave** device. Since enclave memory is protected from direct -access, special privileged instructions are Then used to copy data into enclave +access, special privileged instructions are then used to copy data into enclave pages and establish enclave page permissions. .. kernel-doc:: arch/x86/kernel/cpu/sgx/ioctl.c @@ -126,13 +126,13 @@ the need to juggle signal handlers. ksgxd ===== -SGX support includes a kernel thread called *ksgxwapd*. +SGX support includes a kernel thread called *ksgxd*. EPC sanitization ---------------- ksgxd is started when SGX initializes. Enclave memory is typically ready -For use when the processor powers on or resets. However, if SGX has been in +for use when the processor powers on or resets. However, if SGX has been in use since the reset, enclave pages may be in an inconsistent state. This might occur after a crash and kexec() cycle, for instance. At boot, ksgxd reinitializes all enclave pages so that they can be allocated and re-used. @@ -147,7 +147,7 @@ Page reclaimer Similar to the core kswapd, ksgxd, is responsible for managing the overcommitment of enclave memory. If the system runs out of enclave memory, -*ksgxwapd* “swaps” enclave memory to normal memory. +*ksgxd* “swaps” enclave memory to normal memory. Launch Control ============== @@ -156,7 +156,7 @@ SGX provides a launch control mechanism. After all enclave pages have been copied, kernel executes EINIT function, which initializes the enclave. Only after this the CPU can execute inside the enclave. -ENIT function takes an RSA-3072 signature of the enclave measurement. The function +EINIT function takes an RSA-3072 signature of the enclave measurement. The function checks that the measurement is correct and signature is signed with the key hashed to the four **IA32_SGXLEPUBKEYHASH{0, 1, 2, 3}** MSRs representing the SHA256 of a public key. @@ -184,7 +184,7 @@ CPUs starting from Icelake use Total Memory Encryption (TME) in the place of MEE. TME-based SGX implementations do not have an integrity Merkle tree, which means integrity and replay-attacks are not mitigated. B, it includes additional changes to prevent cipher text from being returned and SW memory -aliases from being Created. +aliases from being created. DMA to enclave memory is blocked by range registers on both MEE and TME systems (SDM section 41.10). 
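To make the Launch Control description above concrete: the four IA32_SGXLEPUBKEYHASH MSRs together hold a SHA256 digest, so the key check EINIT performs amounts to comparing that digest against the hash of the signer's public key. A hedged sketch follows - the helper is illustrative, not kernel code, and it assumes a little-endian x86 host so the 32 digest bytes and the four 64-bit MSR words share a byte layout:

  #include <stdbool.h>
  #include <stdint.h>
  #include <string.h>

  /* Compare SHA256(signer public key) with IA32_SGXLEPUBKEYHASH{0,1,2,3}. */
  static bool le_pubkey_hash_matches(const uint8_t sha256[32],
                                     const uint64_t msr[4])
  {
          return memcmp(sha256, msr, 32) == 0;
  }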
-- Gitee From 36e9222e730082efed8954e12dbe18296895a300 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 16 Nov 2021 18:21:16 +0200 Subject: [PATCH 084/134] x86/sgx: Add an attribute for the amount of SGX memory in a NUMA node commit 50468e4313355b161cac8a5155a45832995b7f25 upstream. == Problem == The amount of SGX memory on a system is determined by the BIOS and it varies wildly between systems. It can be as small as dozens of MB's and as large as many GB's on servers. Just like how applications need to know how much regular RAM is available, enclave builders need to know how much SGX memory an enclave can consume. == Solution == Introduce a new sysfs file: /sys/devices/system/node/nodeX/x86/sgx_total_bytes to enumerate the amount of SGX memory available in each NUMA node. This serves the same function for SGX as /proc/meminfo or /sys/devices/system/node/nodeX/meminfo does for normal RAM. 'sgx_total_bytes' is needed today to help drive the SGX selftests. SGX-specific swap code is exercised by creating overcommitted enclaves which are larger than the physical SGX memory on the system. They currently use a CPUID-based approach which can diverge from the actual amount of SGX memory available. 'sgx_total_bytes' ensures that the selftests can work efficiently and do not attempt stupid things like creating a 100,000 MB enclave on a system with 128 MB of SGX memory. == Implementation Details == Introduce CONFIG_HAVE_ARCH_NODE_DEV_GROUP opt-in flag to expose an arch specific attribute group, and add an attribute for the amount of SGX memory in bytes to each NUMA node: == ABI Design Discussion == As opposed to the per-node ABI, a single, global ABI was considered. However, this would prevent enclaves from being able to size themselves so that they fit on a single NUMA node. Essentially, a single value would rule out NUMA optimizations for enclaves. Create a new "x86/" directory inside each "nodeX/" sysfs directory. 'sgx_total_bytes' is expected to be the first of at least a few sgx-specific files to be placed in the new directory. Just scanning /proc/meminfo, these are the no-brainers that we have for RAM, but we need for SGX: MemTotal: xxxx kB // sgx_total_bytes (implemented here) MemFree: yyyy kB // sgx_free_bytes SwapTotal: zzzz kB // sgx_swapped_bytes So, at *least* three. I think we will eventually end up needing something more along the lines of a dozen. A new directory (as opposed to being in the nodeX/ "root") directory avoids cluttering the root with several "sgx_*" files. Place the new file in a new "nodeX/x86/" directory because SGX is highly x86-specific. It is very unlikely that any other architecture (or even non-Intel x86 vendor) will ever implement SGX. Using "sgx/" as opposed to "x86/" was also considered. But, there is a real chance this can get used for other arch-specific purposes. [ dhansen: rewrite changelog ] Intel-SIG: commit 50468e431335 x86/sgx: Add an attribute for the amount of SGX memory in a NUMA node. Backport for SGX EDMM support. Signed-off-by: Jarkko Sakkinen Signed-off-by: Dave Hansen Acked-by: Greg Kroah-Hartman Acked-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211116162116.93081-2-jarkko@kernel.org [ Zhiquan: This patch adds a new element into array node_dev_groups[], however, in 5.4 code the array is defined by macro ATTRIBUTE_GROUPS(node_dev). 
Upstream expands this macro at commit 75bd50fa841d ("drivers/base/node.c:
use bin_attribute to break the size limitation of cpumap ABI"); it's part of
the series
https://lore.kernel.org/all/20210806110251.560-1-song.bao.hua@hisilicon.com
which involves a lot of additional changes. Unfortunately, that series needs
fixes that pull in more and more dependencies. To keep things simple, just
expand the macro without functional changes. ]
Signed-off-by: Zhiquan Li
---
 Documentation/ABI/stable/sysfs-devices-node |  6 ++++++
 arch/Kconfig                                |  4 ++++
 arch/x86/Kconfig                            |  1 +
 arch/x86/kernel/cpu/sgx/main.c              | 20 ++++++++++++++++++++
 arch/x86/kernel/cpu/sgx/sgx.h               |  1 +
 drivers/base/node.c                         | 13 ++++++++++++-
 include/linux/numa.h                        |  4 ++++
 7 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
index df8413cf1468..7ba802e6ab0c 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -176,3 +176,9 @@ Contact:	Keith Busch
 Description:
 		The cache write policy: 0 for write-back, 1 for write-through,
 		other or unknown.
+
+What:		/sys/devices/system/node/nodeX/x86/sgx_total_bytes
+Date:		November 2021
+Contact:	Jarkko Sakkinen
+Description:
+		The total amount of SGX physical memory in bytes.
diff --git a/arch/Kconfig b/arch/Kconfig
index 6cb98cd17acf..a612f8e49bb1 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -972,6 +972,10 @@ config ARCH_HAS_MEM_ENCRYPT
 config DYNAMIC_SIGFRAME
 	bool
 
+# Select, if arch has a named attribute group bound to NUMA device nodes.
+config HAVE_ARCH_NODE_DEV_GROUP
+	bool
+
 source "kernel/gcov/Kconfig"
 
 source "scripts/gcc-plugins/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e9c1b1428ce0..d05531a4ea77 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -227,6 +227,7 @@ config X86
 	select VIRT_TO_BUS
 	select X86_FEATURE_NAMES	if PROC_FS
 	select PROC_PID_ARCH_STATUS	if PROC_FS
+	select HAVE_ARCH_NODE_DEV_GROUP	if X86_SGX
 
 config INSTRUCTION_DECODER
 	def_bool y
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 89802fd3cbb8..ccb4cdd22717 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -825,9 +825,11 @@ static bool __init sgx_page_cache_init(void)
 			INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
 			INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list);
 			node_set(nid, sgx_numa_mask);
+			sgx_numa_nodes[nid].size = 0;
 		}
 
 		sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
+		sgx_numa_nodes[nid].size += size;
 
 		sgx_nr_epc_sections++;
 	}
@@ -901,6 +903,24 @@ int sgx_set_attribute(unsigned long *allowed_attributes,
 }
 EXPORT_SYMBOL_GPL(sgx_set_attribute);
 
+#ifdef CONFIG_NUMA
+static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size);
+}
+static DEVICE_ATTR_RO(sgx_total_bytes);
+
+static struct attribute *arch_node_dev_attrs[] = {
+	&dev_attr_sgx_total_bytes.attr,
+	NULL,
+};
+
+const struct attribute_group arch_node_dev_group = {
+	.name = "x86",
+	.attrs = arch_node_dev_attrs,
+};
+#endif /* CONFIG_NUMA */
+
 static int __init sgx_init(void)
 {
 	int ret;
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 9ec3136c7800..0f17def9fe6f 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -44,6 +44,7 @@ struct sgx_epc_page {
 struct sgx_numa_node {
 	struct list_head free_page_list;
 	struct list_head sgx_poison_page_list;
+	unsigned long size;
 	spinlock_t lock;
 };
 
diff --git a/drivers/base/node.c b/drivers/base/node.c
index c9976dc4aa65..5be6fceb4967 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -544,7 +544,18 @@ static struct attribute *node_dev_attrs[] = {
 	&dev_attr_vmstat.attr,
 	NULL
 };
-ATTRIBUTE_GROUPS(node_dev);
+
+static const struct attribute_group node_dev_group = {
+	.attrs = node_dev_attrs,
+};
+
+static const struct attribute_group *node_dev_groups[] = {
+	&node_dev_group,
+#ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP
+	&arch_node_dev_group,
+#endif
+	NULL
+};
 
 #ifdef CONFIG_HUGETLBFS
 /*
diff --git a/include/linux/numa.h b/include/linux/numa.h
index a42df804679e..f8bd68110d33 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -41,4 +41,8 @@ static inline int phys_to_target_node(phys_addr_t addr)
 }
 #endif
 
+#ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP
+extern const struct attribute_group arch_node_dev_group;
+#endif
+
 #endif /* _LINUX_NUMA_H */
--
Gitee

From c3ebff4209801839bcc7cde1ef853519f0498fc2 Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Tue, 4 Jan 2022 09:15:27 -0800
Subject: [PATCH 085/134] x86/sgx: Fix NULL pointer dereference on non-SGX systems

commit 2056e2989bf47ad7274ecc5e9dda2add53c112f9 upstream.

== Problem ==

Nathan Chancellor reported an oops when accessing the 'sgx_total_bytes' sysfs file:

https://lore.kernel.org/all/YbzhBrimHGGpddDM@archlinux-ax161/

The sysfs output code accesses the sgx_numa_nodes[] array unconditionally. However, this array is allocated during SGX initialization, which only occurs on systems where SGX is supported.

If the sysfs file is accessed on systems without SGX support, sgx_numa_nodes[] is NULL and an oops occurs.

== Solution ==

To fix this, hide the entire nodeX/x86/ attribute group on systems without SGX support using the ->is_visible attribute group callback.

Unfortunately, SGX is initialized via a device_initcall() which occurs _after_ the ->is_visible() callback. Instead of moving SGX initialization earlier, call sysfs_update_group() during SGX initialization to update the group visibility.

This update requires moving the SGX sysfs code earlier in sgx/main.c. There are no code changes other than the addition of arch_update_sysfs_visibility() and a minor whitespace fixup to arch_node_attr_is_visible() which checkpatch caught.

Intel-SIG: commit 2056e2989bf4 x86/sgx: Fix NULL pointer dereference on non-SGX systems. Backport for SGX EDMM support.
CC: Greg Kroah-Hartman Cc: linux-sgx@vger.kernel.org Cc: x86@kernel.org Fixes: 50468e431335 ("x86/sgx: Add an attribute for the amount of SGX memory in a NUMA node") Reported-by: Nathan Chancellor Signed-off-by: Dave Hansen Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jarkko Sakkinen Tested-by: Nathan Chancellor Tested-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/20220104171527.5E8416A8@davehans-spike.ostc.intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/main.c | 65 ++++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index ccb4cdd22717..49e7ea6d1ae5 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -6,11 +6,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include "driver.h" #include "encl.h" @@ -780,6 +782,48 @@ static inline u64 __init sgx_calc_section_metric(u64 low, u64 high) ((high & GENMASK_ULL(19, 0)) << 32); } +#ifdef CONFIG_NUMA +static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size); +} +static DEVICE_ATTR_RO(sgx_total_bytes); + +static umode_t arch_node_attr_is_visible(struct kobject *kobj, + struct attribute *attr, int idx) +{ + /* Make all x86/ attributes invisible when SGX is not initialized: */ + if (nodes_empty(sgx_numa_mask)) + return 0; + + return attr->mode; +} + +static struct attribute *arch_node_dev_attrs[] = { + &dev_attr_sgx_total_bytes.attr, + NULL, +}; + +const struct attribute_group arch_node_dev_group = { + .name = "x86", + .attrs = arch_node_dev_attrs, + .is_visible = arch_node_attr_is_visible, +}; + +static void __init arch_update_sysfs_visibility(int nid) +{ + struct node *node = node_devices[nid]; + int ret; + + ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group); + + if (ret) + pr_err("sysfs update failed (%d), files may be invisible", ret); +} +#else /* !CONFIG_NUMA */ +static void __init arch_update_sysfs_visibility(int nid) {} +#endif + static bool __init sgx_page_cache_init(void) { u32 eax, ebx, ecx, edx, type; @@ -826,6 +870,9 @@ static bool __init sgx_page_cache_init(void) INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list); node_set(nid, sgx_numa_mask); sgx_numa_nodes[nid].size = 0; + + /* Make SGX-specific node sysfs files visible: */ + arch_update_sysfs_visibility(nid); } sgx_epc_sections[i].node = &sgx_numa_nodes[nid]; @@ -903,24 +950,6 @@ int sgx_set_attribute(unsigned long *allowed_attributes, } EXPORT_SYMBOL_GPL(sgx_set_attribute); -#ifdef CONFIG_NUMA -static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size); -} -static DEVICE_ATTR_RO(sgx_total_bytes); - -static struct attribute *arch_node_dev_attrs[] = { - &dev_attr_sgx_total_bytes.attr, - NULL, -}; - -const struct attribute_group arch_node_dev_group = { - .name = "x86", - .attrs = arch_node_dev_attrs, -}; -#endif /* CONFIG_NUMA */ - static int __init sgx_init(void) { int ret; -- Gitee From 0e1a4c5ac3253abce443fb626f130bb313e44e1c Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Wed, 2 Feb 2022 11:41:12 -0800 Subject: [PATCH 086/134] x86/sgx: Fix missing poison handling in reclaimer commit e5733d8c89c3b57c8fcd40b8acf508388fabaa42 upstream. 
The SGX reclaimer code lacks page poison handling in its main free path. This can lead to avoidable machine checks if a poisoned page is freed and reallocated instead of being isolated. A troublesome scenario is: 1. Machine check (#MC) occurs (asynchronous, !MF_ACTION_REQUIRED) 2. arch_memory_failure() is eventually called 3. (SGX) page->poison set to 1 4. Page is reclaimed 5. Page added to normal free lists by sgx_reclaim_pages() ^ This is the bug (poison pages should be isolated on the sgx_poison_page_list instead) 6. Page is reallocated by some innocent enclave, a second (synchronous) in-kernel #MC is induced, probably during EADD instruction. ^ This is the fallout from the bug (6) is unfortunate and can be avoided by replacing the open coded enclave page freeing code in the reclaimer with sgx_free_epc_page() to obtain support for poison page handling that includes placing the poisoned page on the correct list. Intel-SIG: commit e5733d8c89c3 x86/sgx: Fix missing poison handling in reclaimer. Backport for SGX EDMM support. Fixes: d6d261bded8a ("x86/sgx: Add new sgx_epc_page flag bit to mark free pages") Fixes: 992801ae9243 ("x86/sgx: Initial poison handling for dirty and free pages") Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/dcc95eb2aaefb042527ac50d0a50738c7c160dac.1643830353.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/main.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 49e7ea6d1ae5..6227dc75a596 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -344,10 +344,8 @@ static void sgx_reclaim_pages(void) { struct sgx_epc_page *chunk[SGX_NR_TO_SCAN]; struct sgx_backing backing[SGX_NR_TO_SCAN]; - struct sgx_epc_section *section; struct sgx_encl_page *encl_page; struct sgx_epc_page *epc_page; - struct sgx_numa_node *node; pgoff_t page_index; int cnt = 0; int ret; @@ -418,13 +416,7 @@ static void sgx_reclaim_pages(void) kref_put(&encl_page->encl->refcount, sgx_encl_release); epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; - section = &sgx_epc_sections[epc_page->section]; - node = section->node; - - spin_lock(&node->lock); - list_add_tail(&epc_page->list, &node->free_page_list); - spin_unlock(&node->lock); - atomic_long_inc(&sgx_nr_free_pages); + sgx_free_epc_page(epc_page); } } -- Gitee From 390d464801c5be73cd7acf4863b1edfc2919a7b5 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Fri, 4 Mar 2022 00:38:58 +0200 Subject: [PATCH 087/134] x86/sgx: Free backing memory after faulting the enclave page commit 08999b2489b4c9b939d7483dbd03702ee4576d96 upstream. There is a limited amount of SGX memory (EPC) on each system. When that memory is used up, SGX has its own swapping mechanism which is similar in concept but totally separate from the core mm/* code. Instead of swapping to disk, SGX swaps from EPC to normal RAM. That normal RAM comes from a shared memory pseudo-file and can itself be swapped by the core mm code. There is a hierarchy like this: EPC <-> shmem <-> disk After data is swapped back in from shmem to EPC, the shmem backing storage needs to be freed. Currently, the backing shmem is not freed. This effectively wastes the shmem while the enclave is running. The memory is recovered when the enclave is destroyed and the backing storage freed. 
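In shmem terms, the missing step is to release a backing page once its contents are resident in the EPC again. A hedged userspace analogue of that release - illustrative only, with a memfd standing in for the enclave's backing file and error handling mostly omitted:

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <stdint.h>
  #include <string.h>
  #include <sys/mman.h>
  #include <unistd.h>

  int main(void)
  {
          int fd = memfd_create("sgx-backing", 0);
          uint8_t page[4096];

          if (fd < 0)
                  return 1;

          memset(page, 0xa5, sizeof(page));

          /* "Reclaim": page contents written out to the backing store. */
          pwrite(fd, page, sizeof(page), 0);

          /* "Fault": contents loaded back into the (simulated) EPC. */
          pread(fd, page, sizeof(page), 0);

          /*
           * The fix, in these terms: punch out the now-stale backing
           * page instead of leaving it allocated for the enclave's
           * lifetime (the kernel side uses shmem_truncate_range()).
           */
          fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                    0, sizeof(page));

          close(fd);
          return 0;
  }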
Sort this out by freeing memory with shmem_truncate_range(), as soon as a page is faulted back to the EPC. In addition, free the memory for PCMD pages as soon as all PCMD's in a page have been marked as unused by zeroing its contents. Intel-SIG: commit 08999b2489b4 x86/sgx: Free backing memory after faulting the enclave page. Backport for SGX EDMM support. Cc: stable@vger.kernel.org Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer") Reported-by: Dave Hansen Signed-off-by: Jarkko Sakkinen Signed-off-by: Dave Hansen Link: https://lkml.kernel.org/r/20220303223859.273187-1-jarkko@kernel.org [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.c | 57 ++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 9f978d0cb506..4e279d12becf 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -12,6 +12,30 @@ #include "encls.h" #include "sgx.h" +/* + * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's + * follow right after the EPC data in the backing storage. In addition to the + * visible enclave pages, there's one extra page slot for SECS, before PCMD + * structs. + */ +static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl, + unsigned long page_index) +{ + pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs); + + return epc_end_off + page_index * sizeof(struct sgx_pcmd); +} + +/* + * Free a page from the backing storage in the given page index. + */ +static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index) +{ + struct inode *inode = file_inode(encl->backing); + + shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1); +} + /* * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC * Pages" in the SDM. @@ -22,9 +46,11 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, { unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK; struct sgx_encl *encl = encl_page->encl; + pgoff_t page_index, page_pcmd_off; struct sgx_pageinfo pginfo; struct sgx_backing b; - pgoff_t page_index; + bool pcmd_page_empty; + u8 *pcmd_page; int ret; if (secs_page) @@ -32,14 +58,16 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, else page_index = PFN_DOWN(encl->size); + page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); + ret = sgx_encl_get_backing(encl, page_index, &b); if (ret) return ret; pginfo.addr = encl_page->desc & PAGE_MASK; pginfo.contents = (unsigned long)kmap_atomic(b.contents); - pginfo.metadata = (unsigned long)kmap_atomic(b.pcmd) + - b.pcmd_offset; + pcmd_page = kmap_atomic(b.pcmd); + pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset; if (secs_page) pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page); @@ -55,11 +83,24 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, ret = -EFAULT; } - kunmap_atomic((void *)(unsigned long)(pginfo.metadata - b.pcmd_offset)); + memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd)); + + /* + * The area for the PCMD in the page was zeroed above. 
Check if the + * whole page is now empty meaning that all PCMD's have been zeroed: + */ + pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE); + + kunmap_atomic(pcmd_page); kunmap_atomic((void *)(unsigned long)pginfo.contents); sgx_encl_put_backing(&b, false); + sgx_encl_truncate_backing_page(encl, page_index); + + if (pcmd_page_empty) + sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off)); + return ret; } @@ -579,7 +620,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing) { - pgoff_t pcmd_index = PFN_DOWN(encl->size) + 1 + (page_index >> 5); + pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); struct page *contents; struct page *pcmd; @@ -587,7 +628,7 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, if (IS_ERR(contents)) return PTR_ERR(contents); - pcmd = sgx_encl_get_backing_page(encl, pcmd_index); + pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off)); if (IS_ERR(pcmd)) { put_page(contents); return PTR_ERR(pcmd); @@ -596,9 +637,7 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, backing->page_index = page_index; backing->contents = contents; backing->pcmd = pcmd; - backing->pcmd_offset = - (page_index & (PAGE_SIZE / sizeof(struct sgx_pcmd) - 1)) * - sizeof(struct sgx_pcmd); + backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1); return 0; } -- Gitee From d8f7ca7e49b0a918ca7985b8a3f16a6f21a443cd Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 12 May 2022 14:50:57 -0700 Subject: [PATCH 088/134] x86/sgx: Disconnect backing page references from dirty status commit 6bd429643cc265e94a9d19839c771bcc5d008fa8 upstream. SGX uses shmem backing storage to store encrypted enclave pages and their crypto metadata when enclave pages are moved out of enclave memory. Two shmem backing storage pages are associated with each enclave page - one backing page to contain the encrypted enclave page data and one backing page (shared by a few enclave pages) to contain the crypto metadata used by the processor to verify the enclave page when it is loaded back into the enclave. sgx_encl_put_backing() is used to release references to the backing storage and, optionally, mark both backing store pages as dirty. Managing references and dirty status together in this way results in both backing store pages marked as dirty, even if only one of the backing store pages are changed. Additionally, waiting until the page reference is dropped to set the page dirty risks a race with the page fault handler that may load outdated data into the enclave when a page is faulted right after it is reclaimed. Consider what happens if the reclaimer writes a page to the backing store and the page is immediately faulted back, before the reclaimer is able to set the dirty bit of the page: sgx_reclaim_pages() { sgx_vma_fault() { ... sgx_encl_get_backing(); ... ... sgx_reclaimer_write() { mutex_lock(&encl->lock); /* Write data to backing store */ mutex_unlock(&encl->lock); } mutex_lock(&encl->lock); __sgx_encl_eldu() { ... /* * Enclave backing store * page not released * nor marked dirty - * contents may not be * up to date. */ sgx_encl_get_backing(); ... /* * Enclave data restored * from backing store * and PCMD pages that * are not up to date. * ENCLS[ELDU] faults * because of MAC or PCMD * checking failure. */ sgx_encl_put_backing(); } ... /* set page dirty */ sgx_encl_put_backing(); ... 
mutex_unlock(&encl->lock); } } Remove the option to sgx_encl_put_backing() to set the backing pages as dirty and set the needed pages as dirty right after receiving important data while enclave mutex is held. This ensures that the page fault handler can get up to date data from a page and prepares the code for a following change where only one of the backing pages need to be marked as dirty. Intel-SIG: commit 6bd429643cc2 x86/sgx: Disconnect backing page references from dirty status. Backport for SGX EDMM support. Cc: stable@vger.kernel.org Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer") Suggested-by: Dave Hansen Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Haitao Huang Link: https://lore.kernel.org/linux-sgx/8922e48f-6646-c7cc-6393-7c78dcf23d23@intel.com/ Link: https://lkml.kernel.org/r/fa9f98986923f43e72ef4c6702a50b2a0b3c42e3.1652389823.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.c | 10 ++-------- arch/x86/kernel/cpu/sgx/encl.h | 2 +- arch/x86/kernel/cpu/sgx/main.c | 6 ++++-- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 4e279d12becf..5d8bdc053e38 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -94,7 +94,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, kunmap_atomic(pcmd_page); kunmap_atomic((void *)(unsigned long)pginfo.contents); - sgx_encl_put_backing(&b, false); + sgx_encl_put_backing(&b); sgx_encl_truncate_backing_page(encl, page_index); @@ -645,15 +645,9 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, /** * sgx_encl_put_backing() - Unpin the backing storage * @backing: data for accessing backing storage for the page - * @do_write: mark pages dirty */ -void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write) +void sgx_encl_put_backing(struct sgx_backing *backing) { - if (do_write) { - set_page_dirty(backing->pcmd); - set_page_dirty(backing->contents); - } - put_page(backing->pcmd); put_page(backing->contents); } diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index fec43ca65065..d44e7372151f 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -107,7 +107,7 @@ void sgx_encl_release(struct kref *ref); int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing); -void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write); +void sgx_encl_put_backing(struct sgx_backing *backing); int sgx_encl_test_and_clear_young(struct mm_struct *mm, struct sgx_encl_page *page); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 6227dc75a596..4766f47fc8ec 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -191,6 +191,8 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, backing->pcmd_offset; ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot); + set_page_dirty(backing->pcmd); + set_page_dirty(backing->contents); kunmap_atomic((void *)(unsigned long)(pginfo.metadata - backing->pcmd_offset)); @@ -320,7 +322,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; - sgx_encl_put_backing(&secs_backing, true); + 
sgx_encl_put_backing(&secs_backing); } out: @@ -411,7 +413,7 @@ static void sgx_reclaim_pages(void) encl_page = epc_page->owner; sgx_reclaimer_write(epc_page, &backing[i]); - sgx_encl_put_backing(&backing[i], true); + sgx_encl_put_backing(&backing[i]); kref_put(&encl_page->encl->refcount, sgx_encl_release); epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; -- Gitee From 8033e73d41c62f75d98567195c43240e1f0490c4 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 12 May 2022 14:50:58 -0700 Subject: [PATCH 089/134] x86/sgx: Mark PCMD page as dirty when modifying contents commit 2154e1c11b7080aa19f47160bd26b6f39bbd7824 upstream. Recent commit 08999b2489b4 ("x86/sgx: Free backing memory after faulting the enclave page") expanded __sgx_encl_eldu() to clear an enclave page's PCMD (Paging Crypto MetaData) from the PCMD page in the backing store after the enclave page is restored to the enclave. Since the PCMD page in the backing store is modified the page should be marked as dirty to ensure the modified data is retained. Intel-SIG: commit 2154e1c11b70 x86/sgx: Mark PCMD page as dirty when modifying contents. Backport for SGX EDMM support. Cc: stable@vger.kernel.org Fixes: 08999b2489b4 ("x86/sgx: Free backing memory after faulting the enclave page") Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Haitao Huang Link: https://lkml.kernel.org/r/00cd2ac480db01058d112e347b32599c1a806bc4.1652389823.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 5d8bdc053e38..09ef0660ff97 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -84,6 +84,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, } memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd)); + set_page_dirty(b.pcmd); /* * The area for the PCMD in the page was zeroed above. Check if the -- Gitee From 0821a1e89f6110315e5a8c9fbbd7abf489a37329 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 12 May 2022 14:50:59 -0700 Subject: [PATCH 090/134] x86/sgx: Obtain backing storage page with enclave mutex held commit 0e4e729a830c1e7f31d3b3fbf8feb355a402b117 upstream. Haitao reported encountering a WARN triggered by the ENCLS[ELDU] instruction faulting with a #GP. The WARN is encountered when the reclaimer evicts a range of pages from the enclave when the same pages are faulted back right away. The SGX backing storage is accessed on two paths: when there are insufficient free pages in the EPC the reclaimer works to move enclave pages to the backing storage and as enclaves access pages that have been moved to the backing storage they are retrieved from there as part of page fault handling. An oversubscribed SGX system will often run the reclaimer and page fault handler concurrently and needs to ensure that the backing store is accessed safely between the reclaimer and the page fault handler. This is not the case because the reclaimer accesses the backing store without the enclave mutex while the page fault handler accesses the backing store with the enclave mutex. Consider the scenario where a page is faulted while a page sharing a PCMD page with the faulted page is being reclaimed. The consequence is a race between the reclaimer and page fault handler, the reclaimer attempting to access a PCMD at the same time it is truncated by the page fault handler. 
This could result in lost PCMD data. Data may still be lost if the reclaimer wins the race; this is addressed in the following patch.

The reclaimer accesses pages from the backing storage without holding the enclave mutex and runs the risk of concurrently accessing the backing storage with the page fault handler that does access the backing storage with the enclave mutex held.

In the scenario below a PCMD page is truncated from the backing store after all its pages have been loaded into the enclave, at the same time the PCMD page is loaded from the backing store when one of its pages is reclaimed:

sgx_reclaim_pages() {

                              sgx_vma_fault() {
                                ...
                                mutex_lock(&encl->lock);
                                ...
                                __sgx_encl_eldu() {
                                  ...
                                  if (pcmd_page_empty) {
/*
 * EPC page being reclaimed        /*
 * shares a PCMD page with an       * PCMD page truncated
 * enclave page that is being       * while requested from
 * faulted in.                      * reclaimer.
 */                                 */
sgx_encl_get_backing()  <-------->  sgx_encl_truncate_backing_page()
                                  }
                                mutex_unlock(&encl->lock);
}                             }

In this scenario there is a race between the reclaimer and the page fault handler when the reclaimer attempts to get access to the same PCMD page that is being truncated. This could result in the reclaimer writing to the PCMD page that is then truncated, causing the PCMD data to be lost, or in a new PCMD page being allocated. The lost PCMD data may still occur after protecting the backing store access with the mutex - this is fixed in the next patch. By ensuring the backing store is accessed with the mutex held the enclave page state can be made accurate with the SGX_ENCL_PAGE_BEING_RECLAIMED flag accurately reflecting that a page is in the process of being reclaimed.

Consistently protect the reclaimer's backing store access with the enclave's mutex to ensure that it can safely run concurrently with the page fault handler.

Intel-SIG: commit 0e4e729a830c x86/sgx: Obtain backing storage page with enclave mutex held. Backport for SGX EDMM support.
Cc: stable@vger.kernel.org Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer") Reported-by: Haitao Huang Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen Tested-by: Haitao Huang Link: https://lkml.kernel.org/r/fa2e04c561a8555bfe1f4e7adc37d60efc77387b.1652389823.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/main.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 4766f47fc8ec..e3802dd707ab 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -310,6 +310,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, sgx_encl_ewb(epc_page, backing); encl_page->epc_page = NULL; encl->secs_child_cnt--; + sgx_encl_put_backing(backing); if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) { ret = sgx_encl_get_backing(encl, PFN_DOWN(encl->size), @@ -381,11 +382,14 @@ static void sgx_reclaim_pages(void) goto skip; page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base); + + mutex_lock(&encl_page->encl->lock); ret = sgx_encl_get_backing(encl_page->encl, page_index, &backing[i]); - if (ret) + if (ret) { + mutex_unlock(&encl_page->encl->lock); goto skip; + } - mutex_lock(&encl_page->encl->lock); encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED; mutex_unlock(&encl_page->encl->lock); continue; @@ -413,7 +417,6 @@ static void sgx_reclaim_pages(void) encl_page = epc_page->owner; sgx_reclaimer_write(epc_page, &backing[i]); - sgx_encl_put_backing(&backing[i]); kref_put(&encl_page->encl->refcount, sgx_encl_release); epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; -- Gitee From 478761928fd40759b2fe0b4acc7e833cd055f824 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 12 May 2022 14:51:00 -0700 Subject: [PATCH 091/134] x86/sgx: Fix race between reclaimer and page fault handler commit af117837ceb9a78e995804ade4726ad2c2c8981f upstream. Haitao reported encountering a WARN triggered by the ENCLS[ELDU] instruction faulting with a #GP. The WARN is encountered when the reclaimer evicts a range of pages from the enclave when the same pages are faulted back right away. Consider two enclave pages (ENCLAVE_A and ENCLAVE_B) sharing a PCMD page (PCMD_AB). ENCLAVE_A is in the enclave memory and ENCLAVE_B is in the backing store. PCMD_AB contains just one entry, that of ENCLAVE_B. Scenario proceeds where ENCLAVE_A is being evicted from the enclave while ENCLAVE_B is faulted in. sgx_reclaim_pages() { ... /* * Reclaim ENCLAVE_A */ mutex_lock(&encl->lock); /* * Get a reference to ENCLAVE_A's * shmem page where enclave page * encrypted data will be stored * as well as a reference to the * enclave page's PCMD data page, * PCMD_AB. * Release mutex before writing * any data to the shmem pages. */ sgx_encl_get_backing(...); encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED; mutex_unlock(&encl->lock); /* * Fault ENCLAVE_B */ sgx_vma_fault() { mutex_lock(&encl->lock); /* * Get reference to * ENCLAVE_B's shmem page * as well as PCMD_AB. */ sgx_encl_get_backing(...) /* * Load page back into * enclave via ELDU. */ /* * Release reference to * ENCLAVE_B' shmem page and * PCMD_AB. */ sgx_encl_put_backing(...); /* * PCMD_AB is found empty so * it and ENCLAVE_B's shmem page * are truncated. 
*/ /* Truncate ENCLAVE_B backing page */ sgx_encl_truncate_backing_page(); /* Truncate PCMD_AB */ sgx_encl_truncate_backing_page(); mutex_unlock(&encl->lock); ... } mutex_lock(&encl->lock); encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED; /* * Write encrypted contents of * ENCLAVE_A to ENCLAVE_A shmem * page and its PCMD data to * PCMD_AB. */ sgx_encl_put_backing(...) /* * Reference to PCMD_AB is * dropped and it is truncated. * ENCLAVE_A's PCMD data is lost. */ mutex_unlock(&encl->lock); } What happens next depends on whether it is ENCLAVE_A being faulted in or ENCLAVE_B being evicted - but both end up with ENCLS[ELDU] faulting with a #GP. If ENCLAVE_A is faulted then at the time sgx_encl_get_backing() is called a new PCMD page is allocated and providing the empty PCMD data for ENCLAVE_A would cause ENCLS[ELDU] to #GP If ENCLAVE_B is evicted first then a new PCMD_AB would be allocated by the reclaimer but later when ENCLAVE_A is faulted the ENCLS[ELDU] instruction would #GP during its checks of the PCMD value and the WARN would be encountered. Noting that the reclaimer sets SGX_ENCL_PAGE_BEING_RECLAIMED at the time it obtains a reference to the backing store pages of an enclave page it is in the process of reclaiming, fix the race by only truncating the PCMD page after ensuring that no page sharing the PCMD page is in the process of being reclaimed. Intel-SIG: commit af117837ceb9 x86/sgx: Fix race between reclaimer and page fault handler. Backport for SGX EDMM support. Cc: stable@vger.kernel.org Fixes: 08999b2489b4 ("x86/sgx: Free backing memory after faulting the enclave page") Reported-by: Haitao Huang Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Haitao Huang Link: https://lkml.kernel.org/r/ed20a5db516aa813873268e125680041ae11dfcf.1652389823.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.c | 94 +++++++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 09ef0660ff97..92204f9d45cf 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -12,6 +12,92 @@ #include "encls.h" #include "sgx.h" +#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd)) +/* + * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to + * determine the page index associated with the first PCMD entry + * within a PCMD page. + */ +#define PCMD_FIRST_MASK GENMASK(4, 0) + +/** + * reclaimer_writing_to_pcmd() - Query if any enclave page associated with + * a PCMD page is in process of being reclaimed. + * @encl: Enclave to which PCMD page belongs + * @start_addr: Address of enclave page using first entry within the PCMD page + * + * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is + * stored. The PCMD data of a reclaimed enclave page contains enough + * information for the processor to verify the page at the time + * it is loaded back into the Enclave Page Cache (EPC). + * + * The backing storage to which enclave pages are reclaimed is laid out as + * follows: + * Encrypted enclave pages:SECS page:PCMD pages + * + * Each PCMD page contains the PCMD metadata of + * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages. + * + * A PCMD page can only be truncated if it is (a) empty, and (b) not in the + * process of getting data (and thus soon being non-empty). 
(b) is tested with + * a check if an enclave page sharing the PCMD page is in the process of being + * reclaimed. + * + * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it + * intends to reclaim that enclave page - it means that the PCMD page + * associated with that enclave page is about to get some data and thus + * even if the PCMD page is empty, it should not be truncated. + * + * Context: Enclave mutex (&sgx_encl->lock) must be held. + * Return: 1 if the reclaimer is about to write to the PCMD page + * 0 if the reclaimer has no intention to write to the PCMD page + */ +static int reclaimer_writing_to_pcmd(struct sgx_encl *encl, + unsigned long start_addr) +{ + int reclaimed = 0; + int i; + + /* + * PCMD_FIRST_MASK is based on number of PCMD entries within + * PCMD page being 32. + */ + BUILD_BUG_ON(PCMDS_PER_PAGE != 32); + + for (i = 0; i < PCMDS_PER_PAGE; i++) { + struct sgx_encl_page *entry; + unsigned long addr; + + addr = start_addr + i * PAGE_SIZE; + + /* + * Stop when reaching the SECS page - it does not + * have a page_array entry and its reclaim is + * started and completed with enclave mutex held so + * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED + * flag. + */ + if (addr == encl->base + encl->size) + break; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + continue; + + /* + * VA page slot ID uses same bit as the flag so it is important + * to ensure that the page is not already in backing store. + */ + if (entry->epc_page && + (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) { + reclaimed = 1; + break; + } + } + + return reclaimed; +} + /* * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's * follow right after the EPC data in the backing storage. In addition to the @@ -47,6 +133,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK; struct sgx_encl *encl = encl_page->encl; pgoff_t page_index, page_pcmd_off; + unsigned long pcmd_first_page; struct sgx_pageinfo pginfo; struct sgx_backing b; bool pcmd_page_empty; @@ -58,6 +145,11 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, else page_index = PFN_DOWN(encl->size); + /* + * Address of enclave page using the first entry within the PCMD page. + */ + pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base; + page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); ret = sgx_encl_get_backing(encl, page_index, &b); @@ -99,7 +191,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, sgx_encl_truncate_backing_page(encl, page_index); - if (pcmd_page_empty) + if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off)); return ret; -- Gitee From 09160b18b659d428b28cb05fdab6a902dd539ddc Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 12 May 2022 14:51:01 -0700 Subject: [PATCH 092/134] x86/sgx: Ensure no data in PCMD page after truncate commit e3a3bbe3e99de73043a1d32d36cf4d211dc58c7e upstream. A PCMD (Paging Crypto MetaData) page contains the PCMD structures of enclave pages that have been encrypted and moved to the shmem backing store. When all enclave pages sharing a PCMD page are loaded in the enclave, there is no need for the PCMD page and it can be truncated from the backing store. A few issues appeared around the truncation of PCMD pages. 
The known issues have been addressed but the PCMD handling code could
be made more robust by loudly complaining if any new issue appears in
this area.

Add a check that will complain with a warning if the PCMD page is not
actually empty after it has been truncated. There should never be data
in the PCMD page at this point since it was just checked to be empty
and truncated with enclave mutex held and is updated with the enclave
mutex held.

Intel-SIG: commit e3a3bbe3e99d x86/sgx: Ensure no data in PCMD page
after truncate.
Backport for SGX EDMM support.

Suggested-by: Dave Hansen
Signed-off-by: Reinette Chatre
Signed-off-by: Dave Hansen
Reviewed-by: Jarkko Sakkinen
Tested-by: Haitao Huang
Link: https://lkml.kernel.org/r/6495120fed43fafc1496d09dd23df922b9a32709.1652389823.git.reinette.chatre@intel.com
[ Zhiquan: amend commit log ]
Signed-off-by: Zhiquan Li
---
 arch/x86/kernel/cpu/sgx/encl.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 92204f9d45cf..a4a35426d188 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -187,12 +187,20 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
 	kunmap_atomic(pcmd_page);
 	kunmap_atomic((void *)(unsigned long)pginfo.contents);
 
+	get_page(b.pcmd);
 	sgx_encl_put_backing(&b);
 
 	sgx_encl_truncate_backing_page(encl, page_index);
 
-	if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page))
+	if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
 		sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
+		pcmd_page = kmap_atomic(b.pcmd);
+		if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
+			pr_warn("PCMD page not empty after truncate.\n");
+		kunmap_atomic(pcmd_page);
+	}
+
+	put_page(b.pcmd);
 
 	return ret;
 }
-- 
Gitee

From cbdd358925570b3f6af7382c52be6b77722c3446 Mon Sep 17 00:00:00 2001
From: Kristen Carlson Accardi
Date: Fri, 20 May 2022 10:42:47 -0700
Subject: [PATCH 093/134] x86/sgx: Set active memcg prior to shmem allocation

commit 0c9782e204d3cc5625b9e8bf4e8625d38dfe0139 upstream.

When the system runs out of enclave memory, SGX can reclaim EPC pages
by swapping to normal RAM. These backing pages are allocated via a
per-enclave shared memory area. Since SGX allows unlimited overcommit
on EPC memory, the reclaimer thread can allocate a large number of
backing RAM pages in response to EPC memory pressure.

When the shared memory backing RAM allocation occurs during the
reclaimer thread context, the shared memory is charged to the root
memory control group, and the shmem usage of the enclave is not
properly accounted for, making cgroups ineffective at limiting the
amount of RAM an enclave can consume.

For example, when using a cgroup to launch a set of test enclaves, the
kernel does not properly account for 50% - 75% of shmem page
allocations on average. In the worst case, when nearly all allocations
occur during the reclaimer thread, the kernel accounts less than a
percent of the amount of shmem used by the enclave's cgroup to the
correct cgroup.

SGX stores a list of mm_structs that are associated with an enclave.
Pick one of them during reclaim and charge that mm's memcg with the
shmem allocation. The one that gets picked is arbitrary, but this list
almost always only has one mm. The cases where there is more than one
mm with different memcg's are not worth considering.

Create a new function - sgx_encl_alloc_backing().
This function is used whenever a new backing storage page needs to be allocated. Previously the same function was used for page allocation as well as retrieving a previously allocated page. Prior to backing page allocation, if there is a mm_struct associated with the enclave that is requesting the allocation, it is set as the active memory control group. [ dhansen: - fix merge conflict with ELDU fixes - check against actual ksgxd_tsk, not ->mm ] Intel-SIG: commit 0c9782e204d3 x86/sgx: Set active memcg prior to shmem allocation Backport for SGX virtualization support on Intel Xeon platform. Cc: stable@vger.kernel.org Signed-off-by: Kristen Carlson Accardi Signed-off-by: Dave Hansen Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Link: https://lkml.kernel.org/r/20220520174248.4918-1-kristen@linux.intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.c | 105 ++++++++++++++++++++++++++++++++- arch/x86/kernel/cpu/sgx/encl.h | 7 ++- arch/x86/kernel/cpu/sgx/main.c | 9 ++- 3 files changed, 115 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index a4a35426d188..8488e0867c95 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -152,7 +152,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); - ret = sgx_encl_get_backing(encl, page_index, &b); + ret = sgx_encl_lookup_backing(encl, page_index, &b); if (ret) return ret; @@ -718,7 +718,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, * 0 on success, * -errno otherwise. */ -int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, +static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing) { pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); @@ -743,6 +743,107 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, return 0; } +/* + * When called from ksgxd, returns the mem_cgroup of a struct mm stored + * in the enclave's mm_list. When not called from ksgxd, just returns + * the mem_cgroup of the current task. + */ +static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl) +{ + struct mem_cgroup *memcg = NULL; + struct sgx_encl_mm *encl_mm; + int idx; + + /* + * If called from normal task context, return the mem_cgroup + * of the current task's mm. The remainder of the handling is for + * ksgxd. + */ + if (!current_is_ksgxd()) + return get_mem_cgroup_from_mm(current->mm); + + /* + * Search the enclave's mm_list to find an mm associated with + * this enclave to charge the allocation to. + */ + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + memcg = get_mem_cgroup_from_mm(encl_mm->mm); + + mmput_async(encl_mm->mm); + + break; + } + + srcu_read_unlock(&encl->srcu, idx); + + /* + * In the rare case that there isn't an mm associated with + * the enclave, set memcg to the current active mem_cgroup. + * This will be the root mem_cgroup if there is no active + * mem_cgroup. 
+ */ + if (!memcg) + return get_mem_cgroup_from_mm(NULL); + + return memcg; +} + +/** + * sgx_encl_alloc_backing() - allocate a new backing storage page + * @encl: an enclave pointer + * @page_index: enclave page index + * @backing: data for accessing backing storage for the page + * + * When called from ksgxd, sets the active memcg from one of the + * mms in the enclave's mm_list prior to any backing page allocation, + * in order to ensure that shmem page allocations are charged to the + * enclave. + * + * Return: + * 0 on success, + * -errno otherwise. + */ +int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing) +{ + struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl); + struct mem_cgroup *memcg = set_active_memcg(encl_memcg); + int ret; + + ret = sgx_encl_get_backing(encl, page_index, backing); + + set_active_memcg(memcg); + mem_cgroup_put(encl_memcg); + + return ret; +} + +/** + * sgx_encl_lookup_backing() - retrieve an existing backing storage page + * @encl: an enclave pointer + * @page_index: enclave page index + * @backing: data for accessing backing storage for the page + * + * Retrieve a backing page for loading data back into an EPC page with ELDU. + * It is the caller's responsibility to ensure that it is appropriate to use + * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is + * not used correctly, this will cause an allocation which is not accounted for. + * + * Return: + * 0 on success, + * -errno otherwise. + */ +int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing) +{ + return sgx_encl_get_backing(encl, page_index, backing); +} + /** * sgx_encl_put_backing() - Unpin the backing storage * @backing: data for accessing backing storage for the page diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index d44e7372151f..332ef3568267 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -103,10 +103,13 @@ static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr, int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, unsigned long end, unsigned long vm_flags); +bool current_is_ksgxd(void); void sgx_encl_release(struct kref *ref); int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); -int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, - struct sgx_backing *backing); +int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing); +int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing); void sgx_encl_put_backing(struct sgx_backing *backing); int sgx_encl_test_and_clear_young(struct mm_struct *mm, struct sgx_encl_page *page); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index e3802dd707ab..97877d2650be 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -313,7 +313,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, sgx_encl_put_backing(backing); if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) { - ret = sgx_encl_get_backing(encl, PFN_DOWN(encl->size), + ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size), &secs_backing); if (ret) goto out; @@ -384,7 +384,7 @@ static void sgx_reclaim_pages(void) page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base); mutex_lock(&encl_page->encl->lock); - ret = 
sgx_encl_get_backing(encl_page->encl, page_index, &backing[i]); + ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]); if (ret) { mutex_unlock(&encl_page->encl->lock); goto skip; @@ -475,6 +475,11 @@ static bool __init sgx_page_reclaimer_init(void) return true; } +bool current_is_ksgxd(void) +{ + return current == ksgxd_tsk; +} + static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) { struct sgx_numa_node *node = &sgx_numa_nodes[nid]; -- Gitee From ff9066689a9ea0c1051e66e5b7275ee57f0ffcc4 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:37 -0700 Subject: [PATCH 094/134] x86/sgx: Add short descriptions to ENCLS wrappers commit 4c3f73584c0c0152b75dd6a090558ada39601159 upstream. The SGX ENCLS instruction uses EAX to specify an SGX function and may require additional registers, depending on the SGX function. ENCLS invokes the specified privileged SGX function for managing and debugging enclaves. Macros are used to wrap the ENCLS functionality and several wrappers are used to wrap the macros to make the different SGX functions accessible in the code. The wrappers of the supported SGX functions are cryptic. Add short descriptions of each as a comment. Intel-SIG: commit 4c3f73584c0c x86/sgx: Add short descriptions to ENCLS wrappers. Backport for SGX EDMM support. Suggested-by: Dave Hansen Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/5e78a1126711cbd692d5b8132e0683873398f69e.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encls.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index 9b204843b78d..dddeb9cfaebb 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -162,57 +162,71 @@ static inline bool encls_failed(int ret) ret; \ }) +/* Initialize an EPC page into an SGX Enclave Control Structure (SECS) page. */ static inline int __ecreate(struct sgx_pageinfo *pginfo, void *secs) { return __encls_2(ECREATE, pginfo, secs); } +/* Hash a 256 byte region of an enclave page to SECS:MRENCLAVE. */ static inline int __eextend(void *secs, void *addr) { return __encls_2(EEXTEND, secs, addr); } +/* + * Associate an EPC page to an enclave either as a REG or TCS page + * populated with the provided data. + */ static inline int __eadd(struct sgx_pageinfo *pginfo, void *addr) { return __encls_2(EADD, pginfo, addr); } +/* Finalize enclave build, initialize enclave for user code execution. */ static inline int __einit(void *sigstruct, void *token, void *secs) { return __encls_ret_3(EINIT, sigstruct, secs, token); } +/* Disassociate EPC page from its enclave and mark it as unused. */ static inline int __eremove(void *addr) { return __encls_ret_1(EREMOVE, addr); } +/* Copy data to an EPC page belonging to a debug enclave. */ static inline int __edbgwr(void *addr, unsigned long *data) { return __encls_2(EDGBWR, *data, addr); } +/* Copy data from an EPC page belonging to a debug enclave. */ static inline int __edbgrd(void *addr, unsigned long *data) { return __encls_1_1(EDGBRD, *data, addr); } +/* Track that software has completed the required TLB address clears. */ static inline int __etrack(void *addr) { return __encls_ret_1(ETRACK, addr); } +/* Load, verify, and unblock an EPC page. 
*/ static inline int __eldu(struct sgx_pageinfo *pginfo, void *addr, void *va) { return __encls_ret_3(ELDU, pginfo, addr, va); } +/* Make EPC page inaccessible to enclave, ready to be written to memory. */ static inline int __eblock(void *addr) { return __encls_ret_1(EBLOCK, addr); } +/* Initialize an EPC page into a Version Array (VA) page. */ static inline int __epa(void *addr) { unsigned long rbx = SGX_PAGE_TYPE_VA; @@ -220,6 +234,7 @@ static inline int __epa(void *addr) return __encls_2(EPA, rbx, addr); } +/* Invalidate an EPC page and write it out to main memory. */ static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr, void *va) { -- Gitee From 9d2052f60368b21cf25aaf0fc0772f284a0bbaa8 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:38 -0700 Subject: [PATCH 095/134] x86/sgx: Add wrapper for SGX2 EMODPR function commit 0fb2126db8414e0206960febb3e4a116439c69dd upstream. Add a wrapper for the EMODPR ENCLS leaf function used to restrict enclave page permissions as maintained in the SGX hardware's Enclave Page Cache Map (EPCM). EMODPR: 1) Updates the EPCM permissions of an enclave page by treating the new permissions as a mask. Supplying a value that attempts to relax EPCM permissions has no effect on EPCM permissions (PR bit, see below, is changed). 2) Sets the PR bit in the EPCM entry of the enclave page to indicate that permission restriction is in progress. The bit is reset by the enclave by invoking ENCLU leaf function EACCEPT or EACCEPTCOPY. The enclave may access the page throughout the entire process if conforming to the EPCM permissions for the enclave page. After performing the permission restriction by issuing EMODPR the kernel needs to collaborate with the hardware to ensure that all logical processors sees the new restricted permissions. This is required for the enclave's EACCEPT/EACCEPTCOPY to succeed and is accomplished with the ETRACK flow. Expand enum sgx_return_code with the possible EMODPR return values. Intel-SIG: commit 0fb2126db841 x86/sgx: Add wrapper for SGX2 EMODPR function. Backport for SGX EDMM support. Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/d15e7a769e13e4ca671fa2d0a0d3e3aec5aedbd4.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/include/asm/sgx.h | 5 +++++ arch/x86/kernel/cpu/sgx/encls.h | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index a16e2c9154a3..0b34a4af66f3 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -47,17 +47,22 @@ enum sgx_encls_function { /** * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV + * %SGX_EPC_PAGE_CONFLICT: Page is being written by other ENCLS function. * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not * been completed yet. * %SGX_CHILD_PRESENT SECS has child pages present in the EPC. * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's * public key does not match IA32_SGXLEPUBKEYHASH. + * %SGX_PAGE_NOT_MODIFIABLE: The EPC page cannot be modified because it + * is in the PENDING or MODIFIED state. * %SGX_UNMASKED_EVENT: An unmasked event, e.g. 
INTR, was received */ enum sgx_return_code { + SGX_EPC_PAGE_CONFLICT = 7, SGX_NOT_TRACKED = 11, SGX_CHILD_PRESENT = 13, SGX_INVALID_EINITTOKEN = 16, + SGX_PAGE_NOT_MODIFIABLE = 20, SGX_UNMASKED_EVENT = 128, }; diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index dddeb9cfaebb..427745b30887 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -241,4 +241,10 @@ static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr, return __encls_ret_3(EWB, pginfo, addr, va); } +/* Restrict the EPCM permissions of an EPC page. */ +static inline int __emodpr(struct sgx_secinfo *secinfo, void *addr) +{ + return __encls_ret_2(EMODPR, secinfo, addr); +} + #endif /* _X86_ENCLS_H */ -- Gitee From 19073de784fcf119339507f5aa0c6350b45f4410 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:39 -0700 Subject: [PATCH 096/134] x86/sgx: Add wrapper for SGX2 EMODT function commit 09b38d0b412dbf8922b3dc33103c1a1257519ab9 upstream. Add a wrapper for the EMODT ENCLS leaf function used to change the type of an enclave page as maintained in the SGX hardware's Enclave Page Cache Map (EPCM). EMODT: 1) Updates the EPCM page type of the enclave page. 2) Sets the MODIFIED bit in the EPCM entry of the enclave page. This bit is reset by the enclave by invoking ENCLU leaf function EACCEPT or EACCEPTCOPY. Access from within the enclave to the enclave page is not possible while the MODIFIED bit is set. After changing the enclave page type by issuing EMODT the kernel needs to collaborate with the hardware to ensure that no logical processor continues to hold a reference to the changed page. This is required to ensure no required security checks are circumvented and is required for the enclave's EACCEPT/EACCEPTCOPY to succeed. Ensuring that no references to the changed page remain is accomplished with the ETRACK flow. Intel-SIG: commit 09b38d0b412d x86/sgx: Add wrapper for SGX2 EMODT function. Backport for SGX EDMM support. Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/dba63a8c0db1d510b940beee1ba2a8207efeb1f1.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encls.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index 427745b30887..cdd34508a6b0 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -247,4 +247,10 @@ static inline int __emodpr(struct sgx_secinfo *secinfo, void *addr) return __encls_ret_2(EMODPR, secinfo, addr); } +/* Change the type of an EPC page. */ +static inline int __emodt(struct sgx_secinfo *secinfo, void *addr) +{ + return __encls_ret_2(EMODT, secinfo, addr); +} + #endif /* _X86_ENCLS_H */ -- Gitee From 7322d788a1d4bbf366cb63e85cab9468a97aef0f Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:40 -0700 Subject: [PATCH 097/134] x86/sgx: Add wrapper for SGX2 EAUG function commit 61416b294af02e4747554c0d1b28d436a4a537d2 upstream. Add a wrapper for the EAUG ENCLS leaf function used to add a page to an initialized enclave. EAUG: 1) Stores all properties of the new enclave page in the SGX hardware's Enclave Page Cache Map (EPCM). 2) Sets the PENDING bit in the EPCM entry of the enclave page. This bit is cleared by the enclave by invoking ENCLU leaf function EACCEPT or EACCEPTCOPY. 
Access from within the enclave to the new enclave page is not possible
until the PENDING bit is cleared.

Intel-SIG: commit 61416b294af0 x86/sgx: Add wrapper for SGX2 EAUG
function.
Backport for SGX EDMM support.

Signed-off-by: Reinette Chatre
Signed-off-by: Dave Hansen
Reviewed-by: Jarkko Sakkinen
Link: https://lkml.kernel.org/r/97a46754fe4764e908651df63694fb760f783d6e.1652137848.git.reinette.chatre@intel.com
[ Zhiquan: amend commit log ]
Signed-off-by: Zhiquan Li
---
 arch/x86/kernel/cpu/sgx/encls.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
index cdd34508a6b0..a18d750a6bdd 100644
--- a/arch/x86/kernel/cpu/sgx/encls.h
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -253,4 +253,10 @@ static inline int __emodt(struct sgx_secinfo *secinfo, void *addr)
 	return __encls_ret_2(EMODT, secinfo, addr);
 }
 
+/* Zero a page of EPC memory and add it to an initialized enclave. */
+static inline int __eaug(struct sgx_pageinfo *pginfo, void *addr)
+{
+	return __encls_2(EAUG, pginfo, addr);
+}
+
 #endif /* _X86_ENCLS_H */
-- 
Gitee

From 3363e7113e5fb58410899e717ecb7266b9b0b271 Mon Sep 17 00:00:00 2001
From: Reinette Chatre
Date: Tue, 10 May 2022 11:08:41 -0700
Subject: [PATCH 098/134] x86/sgx: Support loading enclave page without VMA
 permissions check

commit b3fb517dc6020fec85c82171a909da10c6a6f90a upstream.

sgx_encl_load_page() is used to find and load an enclave page into
enclave (EPC) memory, potentially loading it from the backing storage.
Both usages of sgx_encl_load_page() are during an access to the
enclave page from a VMA and thus the permissions of the VMA are
considered before the enclave page is loaded.

SGX2 functions operating on enclave pages belonging to an initialized
enclave require the page to be in EPC. It is thus required to support
loading enclave pages into the EPC independent from a VMA.

Split the current sgx_encl_load_page() to support the two usages:
A new call, sgx_encl_load_page_in_vma(), behaves exactly like the
current sgx_encl_load_page() that takes VMA permissions into account,
while sgx_encl_load_page() just loads an enclave page into EPC.

VMA, PTE, and EPCM permissions continue to dictate whether the pages
can be accessed from within an enclave.

Intel-SIG: commit b3fb517dc602 x86/sgx: Support loading enclave page
without VMA permissions check.
Backport for SGX EDMM support.
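To make the split concrete, a minimal sketch of a VMA-less caller
follows; sgx2_op_on_page() is a hypothetical name used only for
illustration and is not code from this series:

	/*
	 * Sketch only: load an enclave page into the EPC without any
	 * VMA at hand. EPCM and PTE permissions still gate access from
	 * within the enclave.
	 */
	static int sgx2_op_on_page(struct sgx_encl *encl, unsigned long addr)
	{
		struct sgx_encl_page *entry;

		mutex_lock(&encl->lock);

		entry = sgx_encl_load_page(encl, addr);
		if (IS_ERR(entry)) {
			/* A real caller would retry on -EBUSY. */
			mutex_unlock(&encl->lock);
			return PTR_ERR(entry);
		}

		/* ... operate on entry->epc_page with an SGX2 ENCLS leaf ... */

		mutex_unlock(&encl->lock);
		return 0;
	}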
Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/d4393513c1f18987c14a490bcf133bfb71a5dc43.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.c | 57 ++++++++++++++++++++++------------ arch/x86/kernel/cpu/sgx/encl.h | 2 ++ 2 files changed, 40 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 8488e0867c95..6fff090e979e 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -232,25 +232,10 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, return epc_page; } -static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, - unsigned long addr, - unsigned long vm_flags) +static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, + struct sgx_encl_page *entry) { - unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); struct sgx_epc_page *epc_page; - struct sgx_encl_page *entry; - - entry = xa_load(&encl->page_array, PFN_DOWN(addr)); - if (!entry) - return ERR_PTR(-EFAULT); - - /* - * Verify that the faulted page has equal or higher build time - * permissions than the VMA permissions (i.e. the subset of {VM_READ, - * VM_WRITE, VM_EXECUTE} in vma->vm_flags). - */ - if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits) - return ERR_PTR(-EFAULT); /* Entry successfully located. */ if (entry->epc_page) { @@ -276,6 +261,40 @@ static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, return entry; } +static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl, + unsigned long addr, + unsigned long vm_flags) +{ + unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); + struct sgx_encl_page *entry; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + return ERR_PTR(-EFAULT); + + /* + * Verify that the page has equal or higher build time + * permissions than the VMA permissions (i.e. the subset of {VM_READ, + * VM_WRITE, VM_EXECUTE} in vma->vm_flags). 
+ */ + if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits) + return ERR_PTR(-EFAULT); + + return __sgx_encl_load_page(encl, entry); +} + +struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, + unsigned long addr) +{ + struct sgx_encl_page *entry; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + return ERR_PTR(-EFAULT); + + return __sgx_encl_load_page(encl, entry); +} + static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) { unsigned long addr = (unsigned long)vmf->address; @@ -297,7 +316,7 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) mutex_lock(&encl->lock); - entry = sgx_encl_load_page(encl, addr, vma->vm_flags); + entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags); if (IS_ERR(entry)) { mutex_unlock(&encl->lock); @@ -445,7 +464,7 @@ static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl, for ( ; ; ) { mutex_lock(&encl->lock); - entry = sgx_encl_load_page(encl, addr, vm_flags); + entry = sgx_encl_load_page_in_vma(encl, addr, vm_flags); if (PTR_ERR(entry) != -EBUSY) break; diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index 332ef3568267..a4894cf3f56a 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -119,5 +119,7 @@ unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); bool sgx_va_page_full(struct sgx_va_page *va_page); void sgx_encl_free_epc_page(struct sgx_epc_page *page); +struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, + unsigned long addr); #endif /* _X86_ENCL_H */ -- Gitee From 1bdf1500305d4a352b10f71dbac4427939678734 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:42 -0700 Subject: [PATCH 099/134] x86/sgx: Export sgx_encl_ewb_cpumask() commit 7f391752d4adac10cfc1e5d7a76bab0ab5c9c9d4 upstream. Using sgx_encl_ewb_cpumask() to learn which CPUs might have executed an enclave is useful to ensure that TLBs are cleared when changes are made to enclave pages. sgx_encl_ewb_cpumask() is used within the reclaimer when an enclave page is evicted. The upcoming SGX2 support enables changes to be made to enclave pages and will require TLBs to not refer to the changed pages and thus will be needing sgx_encl_ewb_cpumask(). Relocate sgx_encl_ewb_cpumask() to be with the rest of the enclave code in encl.c now that it is no longer unique to the reclaimer. Take care to ensure that any future usage maintains the current context requirement that ETRACK has been called first. Expand the existing comments to highlight this while moving them to a more prominent location before the function. No functional change. Intel-SIG: commit 7f391752d4ad x86/sgx: Export sgx_encl_ewb_cpumask(). Backport for SGX EDMM support. 
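Expressed as code, the four-step flow documented below reads roughly as
follows. This is an illustrative sketch under the enclave mutex,
assuming the tree's sgx_get_epc_virt_addr() helper and the reclaimer's
empty IPI callback:

	/* 1) Initiate hardware tracking. */
	__etrack(sgx_get_epc_virt_addr(encl->secs.epc_page));

	/* 2) + 3) Query the CPUs and kick them out of the enclave. */
	on_each_cpu_mask(sgx_encl_ewb_cpumask(encl), sgx_ipi_cb, NULL, 1);

	/* 4) An SGX function such as ENCLS[EWB] may now be executed. */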
Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/05b60747fd45130cf9fc6edb1c373a69a18a22c5.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.c | 67 ++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/sgx/encl.h | 1 + arch/x86/kernel/cpu/sgx/main.c | 29 --------------- 3 files changed, 68 insertions(+), 29 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 6fff090e979e..a16b39230eb7 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -714,6 +714,73 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) return 0; } +/** + * sgx_encl_ewb_cpumask() - Query which CPUs might be accessing the enclave + * @encl: the enclave + * + * Some SGX functions require that no cached linear-to-physical address + * mappings are present before they can succeed. For example, ENCLS[EWB] + * copies a page from the enclave page cache to regular main memory but + * it fails if it cannot ensure that there are no cached + * linear-to-physical address mappings referring to the page. + * + * SGX hardware flushes all cached linear-to-physical mappings on a CPU + * when an enclave is exited via ENCLU[EEXIT] or an Asynchronous Enclave + * Exit (AEX). Exiting an enclave will thus ensure cached linear-to-physical + * address mappings are cleared but coordination with the tracking done within + * the SGX hardware is needed to support the SGX functions that depend on this + * cache clearing. + * + * When the ENCLS[ETRACK] function is issued on an enclave the hardware + * tracks threads operating inside the enclave at that time. The SGX + * hardware tracking require that all the identified threads must have + * exited the enclave in order to flush the mappings before a function such + * as ENCLS[EWB] will be permitted + * + * The following flow is used to support SGX functions that require that + * no cached linear-to-physical address mappings are present: + * 1) Execute ENCLS[ETRACK] to initiate hardware tracking. + * 2) Use this function (sgx_encl_ewb_cpumask()) to query which CPUs might be + * accessing the enclave. + * 3) Send IPI to identified CPUs, kicking them out of the enclave and + * thus flushing all locally cached linear-to-physical address mappings. + * 4) Execute SGX function. + * + * Context: It is required to call this function after ENCLS[ETRACK]. + * This will ensure that if any new mm appears (racing with + * sgx_encl_mm_add()) then the new mm will enter into the + * enclave with fresh linear-to-physical address mappings. + * + * It is required that all IPIs are completed before a new + * ENCLS[ETRACK] is issued so be sure to protect steps 1 to 3 + * of the above flow with the enclave's mutex. 
+ * + * Return: cpumask of CPUs that might be accessing @encl + */ +const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl) +{ + cpumask_t *cpumask = &encl->cpumask; + struct sgx_encl_mm *encl_mm; + int idx; + + cpumask_clear(cpumask); + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm)); + + mmput_async(encl_mm->mm); + } + + srcu_read_unlock(&encl->srcu, idx); + + return cpumask; +} + static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, pgoff_t index) { diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index a4894cf3f56a..2d8fe77183f8 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -106,6 +106,7 @@ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, bool current_is_ksgxd(void); void sgx_encl_release(struct kref *ref); int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); +const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl); int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing); int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 97877d2650be..2288fc100d64 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -205,35 +205,6 @@ static void sgx_ipi_cb(void *info) { } -static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl) -{ - cpumask_t *cpumask = &encl->cpumask; - struct sgx_encl_mm *encl_mm; - int idx; - - /* - * Can race with sgx_encl_mm_add(), but ETRACK has already been - * executed, which means that the CPUs running in the new mm will enter - * into the enclave with a fresh epoch. - */ - cpumask_clear(cpumask); - - idx = srcu_read_lock(&encl->srcu); - - list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { - if (!mmget_not_zero(encl_mm->mm)) - continue; - - cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm)); - - mmput_async(encl_mm->mm); - } - - srcu_read_unlock(&encl->srcu, idx); - - return cpumask; -} - /* * Swap page to the regular memory transformed to the blocked state by using * EBLOCK, which means that it can no loger be referenced (no new TLB entries). -- Gitee From 28897a049e49a5440f10d6d59481c5f4fedb7c39 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:43 -0700 Subject: [PATCH 100/134] x86/sgx: Rename sgx_encl_ewb_cpumask() as sgx_encl_cpumask() commit bdaa8799f697daa059bf807da40a9444de94d7e3 upstream. sgx_encl_ewb_cpumask() is no longer unique to the reclaimer where it is used during the EWB ENCLS leaf function when EPC pages are written out to main memory and sgx_encl_ewb_cpumask() is used to learn which CPUs might have executed the enclave to ensure that TLBs are cleared. Upcoming SGX2 enabling will use sgx_encl_ewb_cpumask() during the EMODPR and EMODT ENCLS leaf functions that make changes to enclave pages. The function is needed for the same reason it is used now: to learn which CPUs might have executed the enclave to ensure that TLBs no longer point to the changed pages. Rename sgx_encl_ewb_cpumask() to sgx_encl_cpumask() to reflect the broader usage. Intel-SIG: commit bdaa8799f697 x86/sgx: Rename sgx_encl_ewb_cpumask() as sgx_encl_cpumask(). Backport for SGX EDMM support. 
Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/d4d08c449450a13d8dd3bb6c2b1af03895586d4f.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.c | 6 +++--- arch/x86/kernel/cpu/sgx/encl.h | 2 +- arch/x86/kernel/cpu/sgx/main.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index a16b39230eb7..3c7a064ff24b 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -715,7 +715,7 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) } /** - * sgx_encl_ewb_cpumask() - Query which CPUs might be accessing the enclave + * sgx_encl_cpumask() - Query which CPUs might be accessing the enclave * @encl: the enclave * * Some SGX functions require that no cached linear-to-physical address @@ -740,7 +740,7 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) * The following flow is used to support SGX functions that require that * no cached linear-to-physical address mappings are present: * 1) Execute ENCLS[ETRACK] to initiate hardware tracking. - * 2) Use this function (sgx_encl_ewb_cpumask()) to query which CPUs might be + * 2) Use this function (sgx_encl_cpumask()) to query which CPUs might be * accessing the enclave. * 3) Send IPI to identified CPUs, kicking them out of the enclave and * thus flushing all locally cached linear-to-physical address mappings. @@ -757,7 +757,7 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) * * Return: cpumask of CPUs that might be accessing @encl */ -const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl) +const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl) { cpumask_t *cpumask = &encl->cpumask; struct sgx_encl_mm *encl_mm; diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index 2d8fe77183f8..78b8f934de69 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -106,7 +106,7 @@ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, bool current_is_ksgxd(void); void sgx_encl_release(struct kref *ref); int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); -const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl); +const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl); int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing); int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 2288fc100d64..d3ef6d451ae8 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -251,7 +251,7 @@ static void sgx_encl_ewb(struct sgx_epc_page *epc_page, * miss cpus that entered the enclave between * generating the mask and incrementing epoch. */ - on_each_cpu_mask(sgx_encl_ewb_cpumask(encl), + on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); ret = __sgx_encl_ewb(epc_page, va_slot, backing); } -- Gitee From d1f8f8616638a57533f43f46e3c1bab5015f61e6 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:44 -0700 Subject: [PATCH 101/134] x86/sgx: Move PTE zap code to new sgx_zap_enclave_ptes() commit f89c2f9bf5a64f619de06ded4349dff5a35da860 upstream. The SGX reclaimer removes page table entries pointing to pages that are moved to swap. 
SGX2 enables changes to pages belonging to an initialized enclave, thus enclave pages may have their permission or type changed while the page is being accessed by an enclave. Supporting SGX2 requires page table entries to be removed so that any cached mappings to changed pages are removed. For example, with the ability to change enclave page types a regular enclave page may be changed to a Thread Control Structure (TCS) page that may not be accessed by an enclave. Factor out the code removing page table entries to a separate function sgx_zap_enclave_ptes(), fixing accuracy of comments in the process, and make it available to the upcoming SGX2 code. Place sgx_zap_enclave_ptes() with the rest of the enclave code in encl.c interacting with the page table since this code is no longer unique to the reclaimer. Intel-SIG: commit f89c2f9bf5a6 x86/sgx: Move PTE zap code to new sgx_zap_enclave_ptes(). Backport for SGX EDMM support. Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/b010cdf01d7ce55dd0f00e883b7ccbd9db57160a.1652137848.git.reinette.chatre@intel.com [ Zhiquan Li: amend commit log and resolve the conflict. ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.c | 45 +++++++++++++++++++++++++++++++++- arch/x86/kernel/cpu/sgx/encl.h | 2 +- arch/x86/kernel/cpu/sgx/main.c | 31 ++--------------------- 3 files changed, 47 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 3c7a064ff24b..2e44c7039cdc 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -706,7 +706,7 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) spin_lock(&encl->mm_lock); list_add_rcu(&encl_mm->list, &encl->mm_list); - /* Pairs with smp_rmb() in sgx_reclaimer_block(). */ + /* Pairs with smp_rmb() in sgx_zap_enclave_ptes(). */ smp_wmb(); encl->mm_list_version++; spin_unlock(&encl->mm_lock); @@ -988,6 +988,49 @@ int sgx_encl_test_and_clear_young(struct mm_struct *mm, return ret; } +/** + * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave + * @encl: the enclave + * @addr: page aligned pointer to single page for which PTEs will be removed + * + * Multiple VMAs may have an enclave page mapped. Remove the PTE mapping + * @addr from each VMA. Ensure that page fault handler is ready to handle + * new mappings of @addr before calling this function. + */ +void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr) +{ + unsigned long mm_list_version; + struct sgx_encl_mm *encl_mm; + struct vm_area_struct *vma; + int idx, ret; + + do { + mm_list_version = encl->mm_list_version; + + /* Pairs with smp_wmb() in sgx_encl_mm_add(). 
*/ + smp_rmb(); + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + mmap_read_lock(encl_mm->mm); + + ret = sgx_encl_find(encl_mm->mm, addr, &vma); + if (!ret && encl == vma->vm_private_data) + zap_vma_ptes(vma, addr, PAGE_SIZE); + + mmap_read_unlock(encl_mm->mm); + + mmput_async(encl_mm->mm); + } + + srcu_read_unlock(&encl->srcu, idx); + } while (unlikely(encl->mm_list_version != mm_list_version)); +} + /** * sgx_alloc_va_page() - Allocate a Version Array (VA) page * diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index 78b8f934de69..5ca4cb9bb4fe 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -114,7 +114,7 @@ int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, void sgx_encl_put_backing(struct sgx_backing *backing); int sgx_encl_test_and_clear_young(struct mm_struct *mm, struct sgx_encl_page *page); - +void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr); struct sgx_epc_page *sgx_alloc_va_page(void); unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index d3ef6d451ae8..c0332bd839b5 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -137,36 +137,9 @@ static void sgx_reclaimer_block(struct sgx_epc_page *epc_page) struct sgx_encl_page *page = epc_page->owner; unsigned long addr = page->desc & PAGE_MASK; struct sgx_encl *encl = page->encl; - unsigned long mm_list_version; - struct sgx_encl_mm *encl_mm; - struct vm_area_struct *vma; - int idx, ret; - - do { - mm_list_version = encl->mm_list_version; - - /* Pairs with smp_rmb() in sgx_encl_mm_add(). */ - smp_rmb(); - - idx = srcu_read_lock(&encl->srcu); - - list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { - if (!mmget_not_zero(encl_mm->mm)) - continue; - - down_read(&encl_mm->mm->mmap_sem); - - ret = sgx_encl_find(encl_mm->mm, addr, &vma); - if (!ret && encl == vma->vm_private_data) - zap_vma_ptes(vma, addr, PAGE_SIZE); - - up_read(&encl_mm->mm->mmap_sem); - - mmput_async(encl_mm->mm); - } + int ret; - srcu_read_unlock(&encl->srcu, idx); - } while (unlikely(encl->mm_list_version != mm_list_version)); + sgx_zap_enclave_ptes(encl, addr); mutex_lock(&encl->lock); -- Gitee From c17ac2770b6487d95d9747c258ce058d466cbfe4 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:45 -0700 Subject: [PATCH 102/134] x86/sgx: Make sgx_ipi_cb() available internally commit c7c6a8a61b0066ba7e891783032dc2a7873c6dc7 upstream. The ETRACK function followed by an IPI to all CPUs within an enclave is a common pattern with more frequent use in support of SGX2. Make the (empty) IPI callback function available internally in preparation for usage by SGX2. Intel-SIG: commit c7c6a8a61b00 x86/sgx: Make sgx_ipi_cb() available internally. Backport for SGX EDMM support. 
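With the callback visible outside main.c, the ETRACK-plus-IPI pattern
can be packaged in one place. A sketch of such a helper follows; the
name sgx_enclave_etrack() is hypothetical here, modeled on how the
later SGX2 patches use the callback:

	static int sgx_enclave_etrack(struct sgx_encl *encl)
	{
		void *epc_virt;
		int ret;

		epc_virt = sgx_get_epc_virt_addr(encl->secs.epc_page);
		ret = __etrack(epc_virt);
		if (ret) {
			/* Stale tracking: kick CPUs out so ETRACK can be retried. */
			on_each_cpu_mask(sgx_encl_cpumask(encl),
					 sgx_ipi_cb, NULL, 1);
		}

		return ret;
	}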
Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/1179ed4a9c3c1c2abf49d51bfcf2c30b493181cc.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/main.c | 2 +- arch/x86/kernel/cpu/sgx/sgx.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index c0332bd839b5..6f86c184a144 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -174,7 +174,7 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, return ret; } -static void sgx_ipi_cb(void *info) +void sgx_ipi_cb(void *info) { } diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 0f17def9fe6f..b30cee4de903 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -90,6 +90,8 @@ void sgx_mark_page_reclaimable(struct sgx_epc_page *page); int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); +void sgx_ipi_cb(void *info); + #ifdef CONFIG_X86_SGX_KVM int __init sgx_vepc_init(void); #else -- Gitee From 63c7b1945eb17e1787f67c96eea60e34afcfaf5b Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:46 -0700 Subject: [PATCH 103/134] x86/sgx: Create utility to validate user provided offset and length commit dda03e2c331b9fc7bbc8fc0de12a6d92d8c18661 upstream. User provided offset and length is validated when parsing the parameters of the SGX_IOC_ENCLAVE_ADD_PAGES ioctl(). Extract this validation (with consistent use of IS_ALIGNED) into a utility that can be used by the SGX2 ioctl()s that will also provide these values. Intel-SIG: commit dda03e2c331b x86/sgx: Create utility to validate user provided offset and length. Backport for SGX EDMM support. Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/767147bc100047abed47fe27c592901adfbb93a2.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/ioctl.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index ba716f819bd4..0814085c18fa 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -372,6 +372,26 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, return ret; } +/* + * Ensure user provided offset and length values are valid for + * an enclave. 
+ */
+static int sgx_validate_offset_length(struct sgx_encl *encl,
+				      unsigned long offset,
+				      unsigned long length)
+{
+	if (!IS_ALIGNED(offset, PAGE_SIZE))
+		return -EINVAL;
+
+	if (!length || !IS_ALIGNED(length, PAGE_SIZE))
+		return -EINVAL;
+
+	if (offset + length - PAGE_SIZE >= encl->size)
+		return -EINVAL;
+
+	return 0;
+}
+
 /**
  * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES
  * @encl: an enclave pointer
@@ -425,14 +445,10 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
 	if (copy_from_user(&add_arg, arg, sizeof(add_arg)))
 		return -EFAULT;
 
-	if (!IS_ALIGNED(add_arg.offset, PAGE_SIZE) ||
-	    !IS_ALIGNED(add_arg.src, PAGE_SIZE))
-		return -EINVAL;
-
-	if (!add_arg.length || add_arg.length & (PAGE_SIZE - 1))
+	if (!IS_ALIGNED(add_arg.src, PAGE_SIZE))
 		return -EINVAL;
 
-	if (add_arg.offset + add_arg.length - PAGE_SIZE >= encl->size)
+	if (sgx_validate_offset_length(encl, add_arg.offset, add_arg.length))
 		return -EINVAL;
 
 	if (copy_from_user(&secinfo, (void __user *)add_arg.secinfo,
-- 
Gitee

From 749848da0b2876075925a183ab0301193a781f4e Mon Sep 17 00:00:00 2001
From: Reinette Chatre
Date: Tue, 10 May 2022 11:08:47 -0700
Subject: [PATCH 104/134] x86/sgx: Keep record of SGX page type

commit 8cb7b502f31e6cc4c6ebe2c5eeaa90dcab418cf1 upstream.

SGX2 functions are not allowed on all page types. For example,
ENCLS[EMODPR] is only allowed on regular SGX enclave pages and
ENCLS[EMODT] is only allowed on TCS and regular pages. If these
functions are attempted on another type of page the hardware would
trigger a fault.

Keep a record of the SGX page type so that there is more certainty
whether an SGX2 instruction can succeed and faults can be treated as
real failures.

The page type is a property of struct sgx_encl_page and thus does not
cover the VA page type. VA pages are maintained in separate structures
and their type can be determined in a different way. The SGX2
instructions needing the page type do not operate on VA pages and this
is thus not a scenario needing to be covered at this time.

struct sgx_encl_page hosting this information is maintained for each
enclave page so the space consumed by the struct is important. The
existing sgx_encl_page->vm_max_prot_bits is already unsigned long
while only using three bits. Transition to a bitfield for the two
members to support the additional information without increasing the
space consumed by the struct.

Intel-SIG: commit 8cb7b502f31e x86/sgx: Keep record of SGX page type.
Backport for SGX EDMM support.
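With the page type recorded, SGX2 paths can filter out unsupported
page types before issuing ENCLS and treat any remaining fault as a
real failure. A sketch with a hypothetical helper name:

	static bool sgx_page_perms_modifiable(struct sgx_encl_page *page)
	{
		/* EMODPR is architecturally valid only for regular pages. */
		return page->type == SGX_PAGE_TYPE_REG;
	}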
Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/a0a6939eefe7ba26514f6c49723521cde372de64.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/include/asm/sgx.h | 3 +++ arch/x86/kernel/cpu/sgx/encl.h | 3 ++- arch/x86/kernel/cpu/sgx/ioctl.c | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index 0b34a4af66f3..460201cbd857 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -221,6 +221,9 @@ struct sgx_pageinfo { * %SGX_PAGE_TYPE_REG: a regular page * %SGX_PAGE_TYPE_VA: a VA page * %SGX_PAGE_TYPE_TRIM: a page in trimmed state + * + * Make sure when making changes to this enum that its values can still fit + * in the bitfield within &struct sgx_encl_page */ enum sgx_page_type { SGX_PAGE_TYPE_SECS, diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index 5ca4cb9bb4fe..19e7f661de7e 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -27,7 +27,8 @@ struct sgx_encl_page { unsigned long desc; - unsigned long vm_max_prot_bits; + unsigned long vm_max_prot_bits:8; + enum sgx_page_type type:16; struct sgx_epc_page *epc_page; struct sgx_encl *encl; struct sgx_va_page *va_page; diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 0814085c18fa..3eb2e062f801 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -107,6 +107,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) set_bit(SGX_ENCL_DEBUG, &encl->flags); encl->secs.encl = encl; + encl->secs.type = SGX_PAGE_TYPE_SECS; encl->base = secs->base; encl->size = secs->size; encl->attributes = secs->attributes; @@ -344,6 +345,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, */ encl_page->encl = encl; encl_page->epc_page = epc_page; + encl_page->type = (secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK) >> 8; encl->secs_child_cnt++; if (flags & SGX_PAGE_MEASURE) { -- Gitee From 6d6d77199042f75ae794184506fa4e4cca1b43ef Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:48 -0700 Subject: [PATCH 105/134] x86/sgx: Export sgx_encl_{grow,shrink}() commit 3a5351415228d06c988a1e610e71d3889f707ac9 upstream. In order to use sgx_encl_{grow,shrink}() in the page augmentation code located in encl.c, export these functions. Intel-SIG: commit 3a5351415228 x86/sgx: Export sgx_encl_{grow,shrink}(). Backport for SGX EDMM support. 
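A sketch of the pairing a future caller in encl.c would use follows;
sgx_add_one_page() is a hypothetical name. Note that sgx_encl_grow()
may return NULL when no new VA page is needed, which sgx_encl_shrink()
tolerates:

	static int sgx_add_one_page(struct sgx_encl *encl)
	{
		struct sgx_va_page *va_page;
		int ret;

		va_page = sgx_encl_grow(encl);
		if (IS_ERR(va_page))
			return PTR_ERR(va_page);

		ret = 0;	/* ... allocate an EPC page and EADD/EAUG it ... */
		if (ret)
			sgx_encl_shrink(encl, va_page);

		return ret;
	}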
Suggested-by: Jarkko Sakkinen Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/d51730acf54b6565710b2261b3099517b38c2ec4.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.h | 2 ++ arch/x86/kernel/cpu/sgx/ioctl.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index 19e7f661de7e..bbcc9b8620cd 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -123,5 +123,7 @@ bool sgx_va_page_full(struct sgx_va_page *va_page); void sgx_encl_free_epc_page(struct sgx_epc_page *page); struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, unsigned long addr); +struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl); +void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page); #endif /* _X86_ENCL_H */ diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 3eb2e062f801..633e16af05fe 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -17,7 +17,7 @@ #include "encl.h" #include "encls.h" -static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) +struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) { struct sgx_va_page *va_page = NULL; void *err; @@ -43,7 +43,7 @@ static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) return va_page; } -static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) +void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) { encl->page_cnt--; -- Gitee From 737b085839561025ef5491ff66a4e57ada325a5d Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 10 May 2022 11:08:49 -0700 Subject: [PATCH 106/134] x86/sgx: Export sgx_encl_page_alloc() commit 8123073c4335fcd18ea5e049b85220f122ac1ca3 upstream. Move sgx_encl_page_alloc() to encl.c and export it so that it can be used in the implementation for support of adding pages to initialized enclaves, which requires to allocate new enclave pages. Intel-SIG: commit 8123073c4335 x86/sgx: Export sgx_encl_page_alloc(). Backport for SGX EDMM support. 
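For illustration, the augmentation path can then create the
kernel-side bookkeeping for a dynamically added page. This is a
fragment only, with 'addr' and the surrounding error handling assumed
to be provided by the real caller:

	struct sgx_encl_page *encl_page;

	/* Assumption: dynamically added pages start out readable and writable. */
	encl_page = sgx_encl_page_alloc(encl, addr - encl->base,
					SGX_SECINFO_R | SGX_SECINFO_W);
	if (IS_ERR(encl_page))
		return PTR_ERR(encl_page);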
Signed-off-by: Jarkko Sakkinen Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Link: https://lkml.kernel.org/r/57ae71b4ea17998467670232e12d6617b95c6811.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.c | 32 ++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/sgx/encl.h | 3 +++ arch/x86/kernel/cpu/sgx/ioctl.c | 32 -------------------------------- 3 files changed, 35 insertions(+), 32 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 2e44c7039cdc..efd006fe0547 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -988,6 +988,38 @@ int sgx_encl_test_and_clear_young(struct mm_struct *mm, return ret; } +struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, + unsigned long offset, + u64 secinfo_flags) +{ + struct sgx_encl_page *encl_page; + unsigned long prot; + + encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); + if (!encl_page) + return ERR_PTR(-ENOMEM); + + encl_page->desc = encl->base + offset; + encl_page->encl = encl; + + prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); + + /* + * TCS pages must always RW set for CPU access while the SECINFO + * permissions are *always* zero - the CPU ignores the user provided + * values and silently overwrites them with zero permissions. + */ + if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) + prot |= PROT_READ | PROT_WRITE; + + /* Calculate maximum of the VM flags for the page. */ + encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); + + return encl_page; +} + /** * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave * @encl: the enclave diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index bbcc9b8620cd..c64df914c676 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -115,6 +115,9 @@ int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, void sgx_encl_put_backing(struct sgx_backing *backing); int sgx_encl_test_and_clear_young(struct mm_struct *mm, struct sgx_encl_page *page); +struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, + unsigned long offset, + u64 secinfo_flags); void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr); struct sgx_epc_page *sgx_alloc_va_page(void); unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 633e16af05fe..8617f8e80c2c 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -169,38 +169,6 @@ static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg) return ret; } -static struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, - unsigned long offset, - u64 secinfo_flags) -{ - struct sgx_encl_page *encl_page; - unsigned long prot; - - encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); - if (!encl_page) - return ERR_PTR(-ENOMEM); - - encl_page->desc = encl->base + offset; - encl_page->encl = encl; - - prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | - _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | - _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); - - /* - * TCS pages must always RW set for CPU access while the SECINFO - * permissions are *always* zero - the CPU ignores the 
user provided - * values and silently overwrites them with zero permissions. - */ - if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) - prot |= PROT_READ | PROT_WRITE; - - /* Calculate maximum of the VM flags for the page. */ - encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); - - return encl_page; -} - static int sgx_validate_secinfo(struct sgx_secinfo *secinfo) { u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK; -- Gitee From bee13694b8393ef3f6e1a626bb8a576ebf5772c3 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:50 -0700 Subject: [PATCH 107/134] x86/sgx: Support VA page allocation without reclaiming commit a76e7f1f18884a94998ca82862c0a4e6d0fd2933 upstream. struct sgx_encl should be protected with the mutex sgx_encl->lock. One exception is sgx_encl->page_cnt that is incremented (in sgx_encl_grow()) when an enclave page is added to the enclave. The reason the mutex is not held is to allow the reclaimer to be called directly if there are no EPC pages (in support of a new VA page) available at the time. Incrementing sgx_encl->page_cnt without sgc_encl->lock held is currently (before SGX2) safe from concurrent updates because all paths in which sgx_encl_grow() is called occur before enclave initialization and are protected with an atomic operation on SGX_ENCL_IOCTL. SGX2 includes support for dynamically adding pages after enclave initialization where the protection of SGX_ENCL_IOCTL is not available. Make direct reclaim of EPC pages optional when new VA pages are added to the enclave. Essentially the existing "reclaim" flag used when regular EPC pages are added to an enclave becomes available to the caller when used to allocate VA pages instead of always being "true". When adding pages without invoking the reclaimer it is possible to do so with sgx_encl->lock held, gaining its protection against concurrent updates to sgx_encl->page_cnt after enclave initialization. No functional change. Intel-SIG: commit a76e7f1f1888 x86/sgx: Support VA page allocation without reclaiming. Backport for SGX EDMM support. Reported-by: Haitao Huang Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/42c5934c229982ee67982bb97c6ab34bde758620.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.c | 6 ++++-- arch/x86/kernel/cpu/sgx/encl.h | 4 ++-- arch/x86/kernel/cpu/sgx/ioctl.c | 8 ++++---- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index efd006fe0547..89a99cd86561 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -1065,6 +1065,8 @@ void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr) /** * sgx_alloc_va_page() - Allocate a Version Array (VA) page + * @reclaim: Reclaim EPC pages directly if none available. Enclave + * mutex should not be held if this is set. * * Allocate a free EPC page and convert it to a Version Array (VA) page. 
* @@ -1072,12 +1074,12 @@ void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr) * a VA page, * -errno otherwise */ -struct sgx_epc_page *sgx_alloc_va_page(void) +struct sgx_epc_page *sgx_alloc_va_page(bool reclaim) { struct sgx_epc_page *epc_page; int ret; - epc_page = sgx_alloc_epc_page(NULL, true); + epc_page = sgx_alloc_epc_page(NULL, reclaim); if (IS_ERR(epc_page)) return ERR_CAST(epc_page); diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index c64df914c676..8f809959baa9 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -119,14 +119,14 @@ struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, unsigned long offset, u64 secinfo_flags); void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr); -struct sgx_epc_page *sgx_alloc_va_page(void); +struct sgx_epc_page *sgx_alloc_va_page(bool reclaim); unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); bool sgx_va_page_full(struct sgx_va_page *va_page); void sgx_encl_free_epc_page(struct sgx_epc_page *page); struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, unsigned long addr); -struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl); +struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim); void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page); #endif /* _X86_ENCL_H */ diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 8617f8e80c2c..74deaf51414c 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -17,7 +17,7 @@ #include "encl.h" #include "encls.h" -struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) +struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim) { struct sgx_va_page *va_page = NULL; void *err; @@ -30,7 +30,7 @@ struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) if (!va_page) return ERR_PTR(-ENOMEM); - va_page->epc_page = sgx_alloc_va_page(); + va_page->epc_page = sgx_alloc_va_page(reclaim); if (IS_ERR(va_page->epc_page)) { err = ERR_CAST(va_page->epc_page); kfree(va_page); @@ -64,7 +64,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) struct file *backing; long ret; - va_page = sgx_encl_grow(encl); + va_page = sgx_encl_grow(encl, true); if (IS_ERR(va_page)) return PTR_ERR(va_page); else if (va_page) @@ -275,7 +275,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, return PTR_ERR(epc_page); } - va_page = sgx_encl_grow(encl); + va_page = sgx_encl_grow(encl, true); if (IS_ERR(va_page)) { ret = PTR_ERR(va_page); goto err_out_free; -- Gitee From 2a68ed112e563c7d87b6b4a8844aa72cb56cc7a3 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:51 -0700 Subject: [PATCH 108/134] x86/sgx: Support restricting of enclave page permissions commit ff08530a5232aab3b610db44cdc5045d26421911 upstream. In the initial (SGX1) version of SGX, pages in an enclave need to be created with permissions that support all usages of the pages, from the time the enclave is initialized until it is unloaded. For example, pages used by a JIT compiler or when code needs to otherwise be relocated need to always have RWX permissions. SGX2 includes a new function ENCLS[EMODPR] that is run from the kernel and can be used to restrict the EPCM permissions of regular enclave pages within an initialized enclave. 
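For orientation, a minimal user-space sketch of invoking the ioctl() introduced below; encl_fd, offset and length are placeholders, SGX_SECINFO_R is borrowed from the kernel's asm/sgx.h as the selftests do, and error handling is trimmed:

#include <sys/ioctl.h>
#include <asm/sgx.h>	/* SGX_IOC_* numbers and parameter structs */

#ifndef SGX_SECINFO_R
#define SGX_SECINFO_R	(1 << 0)	/* kernel-internal flag value */
#endif

static int restrict_to_readonly(int encl_fd, unsigned long offset,
				unsigned long length)
{
	struct sgx_enclave_restrict_permissions params = {
		.offset = offset,	/* page aligned, relative to enclave base */
		.length = length,	/* multiple of the page size */
		.permissions = SGX_SECINFO_R,
	};

	if (ioctl(encl_fd, SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS, &params))
		return -1;	/* params.count/params.result describe partial progress */

	/* The enclave must still ENCLU[EACCEPT] each page before using it. */
	return 0;
}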
Introduce ioctl() SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS to support restricting EPCM permissions. With this ioctl() the user specifies a page range and the EPCM permissions to be applied to all pages in the provided range. ENCLS[EMODPR] is run to restrict the EPCM permissions followed by the ENCLS[ETRACK] flow that will ensure no cached linear-to-physical address mappings to the changed pages remain. It is possible for the permission change request to fail on any page within the provided range, either with an error encountered by the kernel or by the SGX hardware while running ENCLS[EMODPR]. To support partial success the ioctl() returns an error code based on failures encountered by the kernel as well as two result output parameters: one for the number of pages that were successfully changed and one for the SGX return code. The page table entry permissions are not impacted by the EPCM permission changes. VMAs and PTEs will continue to allow the maximum vetted permissions determined at the time the pages are added to the enclave. The SGX error code in a page fault will indicate if it was an EPCM permission check that prevented an access attempt. No checking is done to ensure that the permissions are actually being restricted. This is because the enclave may have relaxed the EPCM permissions from within the enclave without the kernel knowing. An attempt to relax permissions using this call will be ignored by the hardware. Intel-SIG: commit ff08530a5232 x86/sgx: Support restricting of enclave page permissions. Backport for SGX EDMM support. Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen Tested-by: Haitao Huang Tested-by: Vijay Dhanraj Link: https://lkml.kernel.org/r/082cee986f3c1a2f4fdbf49501d7a8c5a98446f8.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/include/uapi/asm/sgx.h | 21 ++++ arch/x86/kernel/cpu/sgx/ioctl.c | 216 ++++++++++++++++++++++++++++++++ 2 files changed, 237 insertions(+) diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h index 95f2b40a55cd..2b898da3d2de 100644 --- a/arch/x86/include/uapi/asm/sgx.h +++ b/arch/x86/include/uapi/asm/sgx.h @@ -29,6 +29,8 @@ enum sgx_page_flags { _IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision) #define SGX_IOC_VEPC_REMOVE_ALL \ _IO(SGX_MAGIC, 0x04) +#define SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS \ + _IOWR(SGX_MAGIC, 0x05, struct sgx_enclave_restrict_permissions) /** * struct sgx_enclave_create - parameter structure for the @@ -76,6 +78,25 @@ struct sgx_enclave_provision { __u64 fd; }; +/** + * struct sgx_enclave_restrict_permissions - parameters for ioctl + * %SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS + * @offset: starting page offset (page aligned relative to enclave base + * address defined in SECS) + * @length: length of memory (multiple of the page size) + * @permissions:new permission bits for pages in range described by @offset + * and @length + * @result: (output) SGX result code of ENCLS[EMODPR] function + * @count: (output) bytes successfully changed (multiple of page size) + */ +struct sgx_enclave_restrict_permissions { + __u64 offset; + __u64 length; + __u64 permissions; + __u64 result; + __u64 count; +}; + struct sgx_enclave_run; /** diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 74deaf51414c..89b0cc62f5f4 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -660,6 +660,218 @@ static long 
sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg) return sgx_set_attribute(&encl->attributes_mask, params.fd); } +/* + * Ensure enclave is ready for SGX2 functions. Readiness is checked + * by ensuring the hardware supports SGX2 and the enclave is initialized + * and thus able to handle requests to modify pages within it. + */ +static int sgx_ioc_sgx2_ready(struct sgx_encl *encl) +{ + if (!(cpu_feature_enabled(X86_FEATURE_SGX2))) + return -ENODEV; + + if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return -EINVAL; + + return 0; +} + +/* + * Some SGX functions require that no cached linear-to-physical address + * mappings are present before they can succeed. Collaborate with + * hardware via ENCLS[ETRACK] to ensure that all cached + * linear-to-physical address mappings belonging to all threads of + * the enclave are cleared. See sgx_encl_cpumask() for details. + * + * Must be called with enclave's mutex held from the time the + * SGX function requiring that no cached linear-to-physical mappings + * are present is executed until this ETRACK flow is complete. + */ +static int sgx_enclave_etrack(struct sgx_encl *encl) +{ + void *epc_virt; + int ret; + + epc_virt = sgx_get_epc_virt_addr(encl->secs.epc_page); + ret = __etrack(epc_virt); + if (ret) { + /* + * ETRACK only fails when there is an OS issue. For + * example, two consecutive ETRACK was sent without + * completed IPI between. + */ + pr_err_once("ETRACK returned %d (0x%x)", ret, ret); + /* + * Send IPIs to kick CPUs out of the enclave and + * try ETRACK again. + */ + on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); + ret = __etrack(epc_virt); + if (ret) { + pr_err_once("ETRACK repeat returned %d (0x%x)", + ret, ret); + return -EFAULT; + } + } + on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); + + return 0; +} + +/** + * sgx_enclave_restrict_permissions() - Restrict EPCM permissions + * @encl: Enclave to which the pages belong. + * @modp: Checked parameters from user on which pages need modifying and + * their new permissions. + * + * Return: + * - 0: Success. + * - -errno: Otherwise. + */ +static long +sgx_enclave_restrict_permissions(struct sgx_encl *encl, + struct sgx_enclave_restrict_permissions *modp) +{ + struct sgx_encl_page *entry; + struct sgx_secinfo secinfo; + unsigned long addr; + unsigned long c; + void *epc_virt; + int ret; + + memset(&secinfo, 0, sizeof(secinfo)); + secinfo.flags = modp->permissions & SGX_SECINFO_PERMISSION_MASK; + + for (c = 0 ; c < modp->length; c += PAGE_SIZE) { + addr = encl->base + modp->offset + c; + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; + goto out_unlock; + } + + /* + * Changing EPCM permissions is only supported on regular + * SGX pages. Attempting this change on other pages will + * result in #PF. + */ + if (entry->type != SGX_PAGE_TYPE_REG) { + ret = -EINVAL; + goto out_unlock; + } + + /* + * Apart from ensuring that read-access remains, do not verify + * the permission bits requested. Kernel has no control over + * how EPCM permissions can be relaxed from within the enclave. + * ENCLS[EMODPR] can only remove existing EPCM permissions, + * attempting to set new permissions will be ignored by the + * hardware. + */ + + /* Change EPCM permissions. 
*/ + epc_virt = sgx_get_epc_virt_addr(entry->epc_page); + ret = __emodpr(&secinfo, epc_virt); + if (encls_faulted(ret)) { + /* + * All possible faults should be avoidable: + * parameters have been checked, will only change + * permissions of a regular page, and no concurrent + * SGX1/SGX2 ENCLS instructions since these + * are protected with mutex. + */ + pr_err_once("EMODPR encountered exception %d\n", + ENCLS_TRAPNR(ret)); + ret = -EFAULT; + goto out_unlock; + } + if (encls_failed(ret)) { + modp->result = ret; + ret = -EFAULT; + goto out_unlock; + } + + ret = sgx_enclave_etrack(encl); + if (ret) { + ret = -EFAULT; + goto out_unlock; + } + + mutex_unlock(&encl->lock); + } + + ret = 0; + goto out; + +out_unlock: + mutex_unlock(&encl->lock); +out: + modp->count = c; + + return ret; +} + +/** + * sgx_ioc_enclave_restrict_permissions() - handler for + * %SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS + * @encl: an enclave pointer + * @arg: userspace pointer to a &struct sgx_enclave_restrict_permissions + * instance + * + * SGX2 distinguishes between relaxing and restricting the enclave page + * permissions maintained by the hardware (EPCM permissions) of pages + * belonging to an initialized enclave (after SGX_IOC_ENCLAVE_INIT). + * + * EPCM permissions cannot be restricted from within the enclave, the enclave + * requires the kernel to run the privileged level 0 instructions ENCLS[EMODPR] + * and ENCLS[ETRACK]. An attempt to relax EPCM permissions with this call + * will be ignored by the hardware. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_ioc_enclave_restrict_permissions(struct sgx_encl *encl, + void __user *arg) +{ + struct sgx_enclave_restrict_permissions params; + long ret; + + ret = sgx_ioc_sgx2_ready(encl); + if (ret) + return ret; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + if (sgx_validate_offset_length(encl, params.offset, params.length)) + return -EINVAL; + + if (params.permissions & ~SGX_SECINFO_PERMISSION_MASK) + return -EINVAL; + + /* + * Fail early if invalid permissions requested to prevent ENCLS[EMODPR] + * from faulting later when the CPU does the same check. + */ + if ((params.permissions & SGX_SECINFO_W) && + !(params.permissions & SGX_SECINFO_R)) + return -EINVAL; + + if (params.result || params.count) + return -EINVAL; + + ret = sgx_enclave_restrict_permissions(encl, ¶ms); + + if (copy_to_user(arg, ¶ms, sizeof(params))) + return -EFAULT; + + return ret; +} + long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { struct sgx_encl *encl = filep->private_data; @@ -681,6 +893,10 @@ long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) case SGX_IOC_ENCLAVE_PROVISION: ret = sgx_ioc_enclave_provision(encl, (void __user *)arg); break; + case SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS: + ret = sgx_ioc_enclave_restrict_permissions(encl, + (void __user *)arg); + break; default: ret = -ENOIOCTLCMD; break; -- Gitee From fdab9b5e9553fdad60bd6ad4f83688efb8920780 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:52 -0700 Subject: [PATCH 109/134] x86/sgx: Support adding of pages to an initialized enclave commit 5a90d2c3f5ef87717e54572af8426aba6fdbdaa6 upstream. With SGX1 an enclave needs to be created with its maximum memory demands allocated. Pages cannot be added to an enclave after it is initialized. SGX2 introduces a new function, ENCLS[EAUG], that can be used to add pages to an initialized enclave. 
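For orientation, a sketch of the enclave-side acceptance step that pairs with EAUG in the flow described below. The EACCEPT leaf number (0x5) and the SECINFO flag values follow the selftests added later in this series and are assumptions here, not part of this patch:

static int encl_eaccept_new_page(unsigned long page_addr)
{
	/* SECINFO must be naturally (64-byte) aligned for ENCLU[EACCEPT]. */
	struct sgx_secinfo secinfo __aligned(sizeof(struct sgx_secinfo)) = {0};
	int rax;

	/* EAUG'd pages are regular (REG) RW pages with the PENDING bit set. */
	secinfo.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_REG |
			SGX_SECINFO_PENDING;

	asm volatile(".byte 0x0f, 0x01, 0xd7"	/* ENCLU */
		     : "=a" (rax)
		     : "a" (0x5 /* EACCEPT */), "b" (&secinfo), "c" (page_addr));

	return rax;	/* zero on success, SGX error code otherwise */
}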
With SGX2 the enclave still needs to set aside address space for its maximum memory demands during enclave creation, but all pages need not be added before enclave initialization. Pages can be added during enclave runtime. Add support for dynamically adding pages to an initialized enclave, architecturally limited to RW permission at creation but allowed to obtain RWX permissions after trusted enclave runs EMODPE. Add pages via the page fault handler at the time an enclave address without a backing enclave page is accessed, potentially directly reclaiming pages if no free pages are available. The enclave is still required to run ENCLU[EACCEPT] on the page before it can be used. A useful flow is for the enclave to run ENCLU[EACCEPT] on an uninitialized address. This will trigger the page fault handler that will add the enclave page and return execution to the enclave to repeat the ENCLU[EACCEPT] instruction, this time successful. If the enclave accesses an uninitialized address in another way, for example by expanding the enclave stack to a page that has not yet been added, then the page fault handler would add the page on the first write but upon returning to the enclave the instruction that triggered the page fault would be repeated and since ENCLU[EACCEPT] was not run yet it would trigger a second page fault, this time with the SGX flag set in the page fault error code. This can only be recovered by entering the enclave again and directly running the ENCLU[EACCEPT] instruction on the now initialized address. Accessing an uninitialized address from outside the enclave also triggers this flow but the page will remain inaccessible (access will result in #PF) until accepted from within the enclave via ENCLU[EACCEPT]. Intel-SIG: commit 5a90d2c3f5ef x86/sgx: Support adding of pages to an initialized enclave. Backport for SGX EDMM support. Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen Tested-by: Haitao Huang Tested-by: Vijay Dhanraj Link: https://lkml.kernel.org/r/a254a58eabea053803277449b24b6e4963a3883b.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.c | 117 +++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 89a99cd86561..4cf0ada6857b 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -295,6 +295,112 @@ struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, return __sgx_encl_load_page(encl, entry); } +/** + * sgx_encl_eaug_page() - Dynamically add page to initialized enclave + * @vma: VMA obtained from fault info from where page is accessed + * @encl: enclave accessing the page + * @addr: address that triggered the page fault + * + * When an initialized enclave accesses a page with no backing EPC page + * on a SGX2 system then the EPC can be added dynamically via the SGX2 + * ENCLS[EAUG] instruction. + * + * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed + * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise. 
+ */ +static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, + struct sgx_encl *encl, unsigned long addr) +{ + vm_fault_t vmret = VM_FAULT_SIGBUS; + struct sgx_pageinfo pginfo = {0}; + struct sgx_encl_page *encl_page; + struct sgx_epc_page *epc_page; + struct sgx_va_page *va_page; + unsigned long phys_addr; + u64 secinfo_flags; + int ret; + + if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return VM_FAULT_SIGBUS; + + /* + * Ignore internal permission checking for dynamically added pages. + * They matter only for data added during the pre-initialization + * phase. The enclave decides the permissions by the means of + * EACCEPT, EACCEPTCOPY and EMODPE. + */ + secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X; + encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags); + if (IS_ERR(encl_page)) + return VM_FAULT_OOM; + + mutex_lock(&encl->lock); + + epc_page = sgx_alloc_epc_page(encl_page, false); + if (IS_ERR(epc_page)) { + if (PTR_ERR(epc_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; + goto err_out_unlock; + } + + va_page = sgx_encl_grow(encl, false); + if (IS_ERR(va_page)) + goto err_out_epc; + + if (va_page) + list_add(&va_page->list, &encl->va_pages); + + ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc), + encl_page, GFP_KERNEL); + /* + * If ret == -EBUSY then page was created in another flow while + * running without encl->lock + */ + if (ret) + goto err_out_shrink; + + pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page); + pginfo.addr = encl_page->desc & PAGE_MASK; + pginfo.metadata = 0; + + ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page)); + if (ret) + goto err_out; + + encl_page->encl = encl; + encl_page->epc_page = epc_page; + encl_page->type = SGX_PAGE_TYPE_REG; + encl->secs_child_cnt++; + + sgx_mark_page_reclaimable(encl_page->epc_page); + + phys_addr = sgx_get_epc_phys_addr(epc_page); + /* + * Do not undo everything when creating PTE entry fails - next #PF + * would find page ready for a PTE. + */ + vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr)); + if (vmret != VM_FAULT_NOPAGE) { + mutex_unlock(&encl->lock); + return VM_FAULT_SIGBUS; + } + mutex_unlock(&encl->lock); + return VM_FAULT_NOPAGE; + +err_out: + xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc)); + +err_out_shrink: + sgx_encl_shrink(encl, va_page); +err_out_epc: + sgx_encl_free_epc_page(epc_page); +err_out_unlock: + mutex_unlock(&encl->lock); + kfree(encl_page); + + return vmret; +} + static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) { unsigned long addr = (unsigned long)vmf->address; @@ -314,6 +420,17 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) if (unlikely(!encl)) return VM_FAULT_SIGBUS; + /* + * The page_array keeps track of all enclave pages, whether they + * are swapped out or not. If there is no entry for this page and + * the system supports SGX2 then it is possible to dynamically add + * a new enclave page. This is only possible for an initialized + * enclave that will be checked for right away. 
+ */ + if (cpu_feature_enabled(X86_FEATURE_SGX2) && + (!xa_load(&encl->page_array, PFN_DOWN(addr)))) + return sgx_encl_eaug_page(vma, encl, addr); + mutex_lock(&encl->lock); entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags); -- Gitee From 7854619ba4ffd659f1b17e656209c27702140c4a Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:53 -0700 Subject: [PATCH 110/134] x86/sgx: Tighten accessible memory range after enclave initialization commit 7b013e723a1f689077347b30778d8831b6d92969 upstream. Before an enclave is initialized the enclave's memory range is unknown. The enclave's memory range is learned at the time it is created via the SGX_IOC_ENCLAVE_CREATE ioctl() where the provided memory range is obtained from an earlier mmap() of /dev/sgx_enclave. After an enclave is initialized its memory can be mapped into user space (mmap()) from where it can be entered at its defined entry points. With the enclave's memory range known after it is initialized there is no reason why it should be possible to map memory outside this range. Lock down access to the initialized enclave's memory range by denying any attempt to map memory outside its memory range. Locking down the memory range also makes adding pages to an initialized enclave more efficient. Pages are added to an initialized enclave by accessing memory that belongs to the enclave's memory range but is not yet backed by an enclave page. If it is possible for user space to map memory that does not form part of the enclave then an access to this memory would eventually fail. Failures range from a prompt general protection fault if the access was an ENCLU[EACCEPT] from within the enclave, to a page fault via the vDSO if it was another access from within the enclave, to a SIGBUS (also resulting from a page fault) if the access was from outside the enclave. Disallowing invalid memory from being mapped in the first place avoids these preventable failures. Intel-SIG: commit 7b013e723a1f x86/sgx: Tighten accessible memory range after enclave initialization. Backport for SGX EDMM support. Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/6391460d75ae79cea2e81eef0f6ffc03c6e9cfe7.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 4cf0ada6857b..b13d1c5f2bc7 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -503,6 +503,11 @@ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, XA_STATE(xas, &encl->page_array, PFN_DOWN(start)); + /* Disallow mapping outside enclave's address range. */ + if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) && + (start < encl->base || end > encl->base + encl->size)) + return -EACCES; + /* * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might * conflict with the enclave page permissions. -- Gitee From 540ba6a9f69338ce73be2adb7d0bfc012b9e5e52 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:54 -0700 Subject: [PATCH 111/134] x86/sgx: Support modifying SGX page type commit 45d546b8c109d69f6659d58b2ace005b2f07f557 upstream. Every enclave contains one or more Thread Control Structures (TCS). The TCS contains meta-data used by the hardware to save and restore thread specific information when entering/exiting the enclave.
With SGX1 an enclave needs to be created with enough TCSs to support the largest number of threads expecting to use the enclave and enough enclave pages to meet all its anticipated memory demands. In SGX1 all pages remain in the enclave until the enclave is unloaded. SGX2 introduces a new function, ENCLS[EMODT], that is used to change the type of an enclave page from a regular (SGX_PAGE_TYPE_REG) enclave page to a TCS (SGX_PAGE_TYPE_TCS) page or change the type from a regular (SGX_PAGE_TYPE_REG) or TCS (SGX_PAGE_TYPE_TCS) page to a trimmed (SGX_PAGE_TYPE_TRIM) page (setting it up for later removal). With the existing support of dynamically adding regular enclave pages to an initialized enclave and changing the page type to TCS it is possible to dynamically increase the number of threads supported by an enclave. Changing the enclave page type to SGX_PAGE_TYPE_TRIM is the first step of dynamically removing pages from an initialized enclave. The complete page removal flow is: 1) Change the type of the pages to be removed to SGX_PAGE_TYPE_TRIM using the SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl() introduced here. 2) Approve the page removal by running ENCLU[EACCEPT] from within the enclave. 3) Initiate actual page removal using the ioctl() introduced in the following patch. Add ioctl() SGX_IOC_ENCLAVE_MODIFY_TYPES to support changing SGX enclave page types within an initialized enclave. With SGX_IOC_ENCLAVE_MODIFY_TYPES the user specifies a page range and the enclave page type to be applied to all pages in the provided range. The ioctl() itself can return an error code based on failures encountered by the kernel. It is also possible for SGX specific failures to be encountered. Add a result output parameter to communicate the SGX return code. It is possible for the enclave page type change request to fail on any page within the provided range. Support partial success by returning the number of pages that were successfully changed. After the page type is changed the page continues to be accessible from the kernel perspective with page table entries and internal state. The page may be moved to swap. Any access until ENCLU[EACCEPT] will encounter a page fault with SGX flag set in error code. Intel-SIG: commit 45d546b8c109 x86/sgx: Support modifying SGX page type. Backport for SGX EDMM support. 
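For orientation, a minimal user-space sketch of converting a regular page to a TCS with the ioctl() introduced here; encl_fd and offset are placeholders, SGX_PAGE_TYPE_TCS is borrowed from the kernel's asm/sgx.h as the selftests do, and error handling is trimmed:

static int convert_page_to_tcs(int encl_fd, unsigned long offset)
{
	struct sgx_enclave_modify_types params = {
		.offset = offset,	/* page aligned, relative to enclave base */
		.length = 4096,		/* multiple of the page size */
		.page_type = SGX_PAGE_TYPE_TCS,
	};

	if (ioctl(encl_fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &params))
		return -1;	/* params.result carries the ENCLS[EMODT] code on failure */

	/* The enclave must ENCLU[EACCEPT] the page before using it as a TCS. */
	return 0;
}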
Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen Tested-by: Haitao Huang Tested-by: Vijay Dhanraj Link: https://lkml.kernel.org/r/babe39318c5bf16fc65fbfb38896cdee72161575.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/include/uapi/asm/sgx.h | 20 ++++ arch/x86/kernel/cpu/sgx/ioctl.c | 202 ++++++++++++++++++++++++++++++++ 2 files changed, 222 insertions(+) diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h index 2b898da3d2de..cec70b935416 100644 --- a/arch/x86/include/uapi/asm/sgx.h +++ b/arch/x86/include/uapi/asm/sgx.h @@ -31,6 +31,8 @@ enum sgx_page_flags { _IO(SGX_MAGIC, 0x04) #define SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS \ _IOWR(SGX_MAGIC, 0x05, struct sgx_enclave_restrict_permissions) +#define SGX_IOC_ENCLAVE_MODIFY_TYPES \ + _IOWR(SGX_MAGIC, 0x06, struct sgx_enclave_modify_types) /** * struct sgx_enclave_create - parameter structure for the @@ -97,6 +99,24 @@ struct sgx_enclave_restrict_permissions { __u64 count; }; +/** + * struct sgx_enclave_modify_types - parameters for ioctl + * %SGX_IOC_ENCLAVE_MODIFY_TYPES + * @offset: starting page offset (page aligned relative to enclave base + * address defined in SECS) + * @length: length of memory (multiple of the page size) + * @page_type: new type for pages in range described by @offset and @length + * @result: (output) SGX result code of ENCLS[EMODT] function + * @count: (output) bytes successfully changed (multiple of page size) + */ +struct sgx_enclave_modify_types { + __u64 offset; + __u64 length; + __u64 page_type; + __u64 result; + __u64 count; +}; + struct sgx_enclave_run; /** diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 89b0cc62f5f4..2cec026ef3e8 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -872,6 +872,205 @@ static long sgx_ioc_enclave_restrict_permissions(struct sgx_encl *encl, return ret; } +/** + * sgx_enclave_modify_types() - Modify type of SGX enclave pages + * @encl: Enclave to which the pages belong. + * @modt: Checked parameters from user about which pages need modifying + * and their new page type. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_enclave_modify_types(struct sgx_encl *encl, + struct sgx_enclave_modify_types *modt) +{ + unsigned long max_prot_restore; + enum sgx_page_type page_type; + struct sgx_encl_page *entry; + struct sgx_secinfo secinfo; + unsigned long prot; + unsigned long addr; + unsigned long c; + void *epc_virt; + int ret; + + page_type = modt->page_type & SGX_PAGE_TYPE_MASK; + + /* + * The only new page types allowed by hardware are PT_TCS and PT_TRIM. + */ + if (page_type != SGX_PAGE_TYPE_TCS && page_type != SGX_PAGE_TYPE_TRIM) + return -EINVAL; + + memset(&secinfo, 0, sizeof(secinfo)); + + secinfo.flags = page_type << 8; + + for (c = 0 ; c < modt->length; c += PAGE_SIZE) { + addr = encl->base + modt->offset + c; + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; + goto out_unlock; + } + + /* + * Borrow the logic from the Intel SDM. Regular pages + * (SGX_PAGE_TYPE_REG) can change type to SGX_PAGE_TYPE_TCS + * or SGX_PAGE_TYPE_TRIM but TCS pages can only be trimmed. + * CET pages not supported yet. 
+ */ + if (!(entry->type == SGX_PAGE_TYPE_REG || + (entry->type == SGX_PAGE_TYPE_TCS && + page_type == SGX_PAGE_TYPE_TRIM))) { + ret = -EINVAL; + goto out_unlock; + } + + max_prot_restore = entry->vm_max_prot_bits; + + /* + * Once a regular page becomes a TCS page it cannot be + * changed back. So the maximum allowed protection reflects + * the TCS page that is always RW from kernel perspective but + * will be inaccessible from within enclave. Before doing + * so, do make sure that the new page type continues to + * respect the originally vetted page permissions. + */ + if (entry->type == SGX_PAGE_TYPE_REG && + page_type == SGX_PAGE_TYPE_TCS) { + if (~entry->vm_max_prot_bits & (VM_READ | VM_WRITE)) { + ret = -EPERM; + goto out_unlock; + } + prot = PROT_READ | PROT_WRITE; + entry->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); + + /* + * Prevent page from being reclaimed while mutex + * is released. + */ + if (sgx_unmark_page_reclaimable(entry->epc_page)) { + ret = -EAGAIN; + goto out_entry_changed; + } + + /* + * Do not keep encl->lock because of dependency on + * mmap_lock acquired in sgx_zap_enclave_ptes(). + */ + mutex_unlock(&encl->lock); + + sgx_zap_enclave_ptes(encl, addr); + + mutex_lock(&encl->lock); + + sgx_mark_page_reclaimable(entry->epc_page); + } + + /* Change EPC type */ + epc_virt = sgx_get_epc_virt_addr(entry->epc_page); + ret = __emodt(&secinfo, epc_virt); + if (encls_faulted(ret)) { + /* + * All possible faults should be avoidable: + * parameters have been checked, will only change + * valid page types, and no concurrent + * SGX1/SGX2 ENCLS instructions since these are + * protected with mutex. + */ + pr_err_once("EMODT encountered exception %d\n", + ENCLS_TRAPNR(ret)); + ret = -EFAULT; + goto out_entry_changed; + } + if (encls_failed(ret)) { + modt->result = ret; + ret = -EFAULT; + goto out_entry_changed; + } + + ret = sgx_enclave_etrack(encl); + if (ret) { + ret = -EFAULT; + goto out_unlock; + } + + entry->type = page_type; + + mutex_unlock(&encl->lock); + } + + ret = 0; + goto out; + +out_entry_changed: + entry->vm_max_prot_bits = max_prot_restore; +out_unlock: + mutex_unlock(&encl->lock); +out: + modt->count = c; + + return ret; +} + +/** + * sgx_ioc_enclave_modify_types() - handler for %SGX_IOC_ENCLAVE_MODIFY_TYPES + * @encl: an enclave pointer + * @arg: userspace pointer to a &struct sgx_enclave_modify_types instance + * + * Ability to change the enclave page type supports the following use cases: + * + * * It is possible to add TCS pages to an enclave by changing the type of + * regular pages (%SGX_PAGE_TYPE_REG) to TCS (%SGX_PAGE_TYPE_TCS) pages. + * With this support the number of threads supported by an initialized + * enclave can be increased dynamically. + * + * * Regular or TCS pages can dynamically be removed from an initialized + * enclave by changing the page type to %SGX_PAGE_TYPE_TRIM. Changing the + * page type to %SGX_PAGE_TYPE_TRIM marks the page for removal with actual + * removal done by handler of %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() called + * after ENCLU[EACCEPT] is run on %SGX_PAGE_TYPE_TRIM page from within the + * enclave. 
+ * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_ioc_enclave_modify_types(struct sgx_encl *encl, + void __user *arg) +{ + struct sgx_enclave_modify_types params; + long ret; + + ret = sgx_ioc_sgx2_ready(encl); + if (ret) + return ret; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + if (sgx_validate_offset_length(encl, params.offset, params.length)) + return -EINVAL; + + if (params.page_type & ~SGX_PAGE_TYPE_MASK) + return -EINVAL; + + if (params.result || params.count) + return -EINVAL; + + ret = sgx_enclave_modify_types(encl, ¶ms); + + if (copy_to_user(arg, ¶ms, sizeof(params))) + return -EFAULT; + + return ret; +} + long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { struct sgx_encl *encl = filep->private_data; @@ -897,6 +1096,9 @@ long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) ret = sgx_ioc_enclave_restrict_permissions(encl, (void __user *)arg); break; + case SGX_IOC_ENCLAVE_MODIFY_TYPES: + ret = sgx_ioc_enclave_modify_types(encl, (void __user *)arg); + break; default: ret = -ENOIOCTLCMD; break; -- Gitee From 504d1a810cc8b91ad48fe03ee461e2e3cc1c2174 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:55 -0700 Subject: [PATCH 112/134] x86/sgx: Support complete page removal commit 9849bb27152c18e8531424c0a8ef5f51ece40aea upstream. The SGX2 page removal flow was introduced in previous patch and is as follows: 1) Change the type of the pages to be removed to SGX_PAGE_TYPE_TRIM using the ioctl() SGX_IOC_ENCLAVE_MODIFY_TYPES introduced in previous patch. 2) Approve the page removal by running ENCLU[EACCEPT] from within the enclave. 3) Initiate actual page removal using the ioctl() SGX_IOC_ENCLAVE_REMOVE_PAGES introduced here. Support the final step of the SGX2 page removal flow with ioctl() SGX_IOC_ENCLAVE_REMOVE_PAGES. With this ioctl() the user specifies a page range that should be removed. All pages in the provided range should have the SGX_PAGE_TYPE_TRIM page type and the request will fail with EPERM (Operation not permitted) if a page that does not have the correct type is encountered. Page removal can fail on any page within the provided range. Support partial success by returning the number of pages that were successfully removed. Since actual page removal will succeed even if ENCLU[EACCEPT] was not run from within the enclave the ENCLU[EMODPR] instruction with RWX permissions is used as a no-op mechanism to ensure ENCLU[EACCEPT] was successfully run from within the enclave before the enclave page is removed. If the user omits running SGX_IOC_ENCLAVE_REMOVE_PAGES the pages will still be removed when the enclave is unloaded. Intel-SIG: commit 9849bb27152c x86/sgx: Support complete page removal. Backport for SGX EDMM support. 
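For orientation, a minimal user-space sketch of the complete removal flow; encl_fd, offset and length are placeholders, SGX_PAGE_TYPE_TRIM is borrowed from the kernel's asm/sgx.h as the selftests do, step 2 runs inside the enclave and is only indicated, and error handling is trimmed:

static int trim_and_remove_pages(int encl_fd, unsigned long offset,
				 unsigned long length)
{
	struct sgx_enclave_modify_types modt = {
		.offset = offset,
		.length = length,
		.page_type = SGX_PAGE_TYPE_TRIM,	/* step 1: mark for removal */
	};
	struct sgx_enclave_remove_pages remove = {
		.offset = offset,
		.length = length,
	};

	if (ioctl(encl_fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt))
		return -1;

	/* Step 2 (not shown): enter the enclave and run ENCLU[EACCEPT] on
	 * each trimmed page; the kernel rejects the removal below otherwise. */

	if (ioctl(encl_fd, SGX_IOC_ENCLAVE_REMOVE_PAGES, &remove))
		return -1;	/* -EPERM if a page in the range is not PT_TRIM */

	return remove.count == length ? 0 : -1;
}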
Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Haitao Huang Tested-by: Vijay Dhanraj Tested-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/b75ee93e96774e38bb44a24b8e9bbfb67b08b51b.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/include/uapi/asm/sgx.h | 21 +++++ arch/x86/kernel/cpu/sgx/ioctl.c | 145 ++++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+) diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h index cec70b935416..d1eb78af0023 100644 --- a/arch/x86/include/uapi/asm/sgx.h +++ b/arch/x86/include/uapi/asm/sgx.h @@ -33,6 +33,8 @@ enum sgx_page_flags { _IOWR(SGX_MAGIC, 0x05, struct sgx_enclave_restrict_permissions) #define SGX_IOC_ENCLAVE_MODIFY_TYPES \ _IOWR(SGX_MAGIC, 0x06, struct sgx_enclave_modify_types) +#define SGX_IOC_ENCLAVE_REMOVE_PAGES \ + _IOWR(SGX_MAGIC, 0x07, struct sgx_enclave_remove_pages) /** * struct sgx_enclave_create - parameter structure for the @@ -117,6 +119,25 @@ struct sgx_enclave_modify_types { __u64 count; }; +/** + * struct sgx_enclave_remove_pages - %SGX_IOC_ENCLAVE_REMOVE_PAGES parameters + * @offset: starting page offset (page aligned relative to enclave base + * address defined in SECS) + * @length: length of memory (multiple of the page size) + * @count: (output) bytes successfully changed (multiple of page size) + * + * Regular (PT_REG) or TCS (PT_TCS) can be removed from an initialized + * enclave if the system supports SGX2. First, the %SGX_IOC_ENCLAVE_MODIFY_TYPES + * ioctl() should be used to change the page type to PT_TRIM. After that + * succeeds ENCLU[EACCEPT] should be run from within the enclave and then + * %SGX_IOC_ENCLAVE_REMOVE_PAGES can be used to complete the page removal. + */ +struct sgx_enclave_remove_pages { + __u64 offset; + __u64 length; + __u64 count; +}; + struct sgx_enclave_run; /** diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 2cec026ef3e8..065cc8d44055 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -1071,6 +1071,148 @@ static long sgx_ioc_enclave_modify_types(struct sgx_encl *encl, return ret; } +/** + * sgx_encl_remove_pages() - Remove trimmed pages from SGX enclave + * @encl: Enclave to which the pages belong + * @params: Checked parameters from user on which pages need to be removed + * + * Return: + * - 0: Success. + * - -errno: Otherwise. + */ +static long sgx_encl_remove_pages(struct sgx_encl *encl, + struct sgx_enclave_remove_pages *params) +{ + struct sgx_encl_page *entry; + struct sgx_secinfo secinfo; + unsigned long addr; + unsigned long c; + void *epc_virt; + int ret; + + memset(&secinfo, 0, sizeof(secinfo)); + secinfo.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X; + + for (c = 0 ; c < params->length; c += PAGE_SIZE) { + addr = encl->base + params->offset + c; + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; + goto out_unlock; + } + + if (entry->type != SGX_PAGE_TYPE_TRIM) { + ret = -EPERM; + goto out_unlock; + } + + /* + * ENCLS[EMODPR] is a no-op instruction used to inform if + * ENCLU[EACCEPT] was run from within the enclave. If + * ENCLS[EMODPR] is run with RWX on a trimmed page that is + * not yet accepted then it will return + * %SGX_PAGE_NOT_MODIFIABLE, after the trimmed page is + * accepted the instruction will encounter a page fault. 
+ */ + epc_virt = sgx_get_epc_virt_addr(entry->epc_page); + ret = __emodpr(&secinfo, epc_virt); + if (!encls_faulted(ret) || ENCLS_TRAPNR(ret) != X86_TRAP_PF) { + ret = -EPERM; + goto out_unlock; + } + + if (sgx_unmark_page_reclaimable(entry->epc_page)) { + ret = -EBUSY; + goto out_unlock; + } + + /* + * Do not keep encl->lock because of dependency on + * mmap_lock acquired in sgx_zap_enclave_ptes(). + */ + mutex_unlock(&encl->lock); + + sgx_zap_enclave_ptes(encl, addr); + + mutex_lock(&encl->lock); + + sgx_encl_free_epc_page(entry->epc_page); + encl->secs_child_cnt--; + entry->epc_page = NULL; + xa_erase(&encl->page_array, PFN_DOWN(entry->desc)); + sgx_encl_shrink(encl, NULL); + kfree(entry); + + mutex_unlock(&encl->lock); + } + + ret = 0; + goto out; + +out_unlock: + mutex_unlock(&encl->lock); +out: + params->count = c; + + return ret; +} + +/** + * sgx_ioc_enclave_remove_pages() - handler for %SGX_IOC_ENCLAVE_REMOVE_PAGES + * @encl: an enclave pointer + * @arg: userspace pointer to &struct sgx_enclave_remove_pages instance + * + * Final step of the flow removing pages from an initialized enclave. The + * complete flow is: + * + * 1) User changes the type of the pages to be removed to %SGX_PAGE_TYPE_TRIM + * using the %SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl(). + * 2) User approves the page removal by running ENCLU[EACCEPT] from within + * the enclave. + * 3) User initiates actual page removal using the + * %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() that is handled here. + * + * First remove any page table entries pointing to the page and then proceed + * with the actual removal of the enclave page and data in support of it. + * + * VA pages are not affected by this removal. It is thus possible that the + * enclave may end up with more VA pages than needed to support all its + * pages. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_ioc_enclave_remove_pages(struct sgx_encl *encl, + void __user *arg) +{ + struct sgx_enclave_remove_pages params; + long ret; + + ret = sgx_ioc_sgx2_ready(encl); + if (ret) + return ret; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + if (sgx_validate_offset_length(encl, params.offset, params.length)) + return -EINVAL; + + if (params.count) + return -EINVAL; + + ret = sgx_encl_remove_pages(encl, ¶ms); + + if (copy_to_user(arg, ¶ms, sizeof(params))) + return -EFAULT; + + return ret; +} + long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { struct sgx_encl *encl = filep->private_data; @@ -1099,6 +1241,9 @@ long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) case SGX_IOC_ENCLAVE_MODIFY_TYPES: ret = sgx_ioc_enclave_modify_types(encl, (void __user *)arg); break; + case SGX_IOC_ENCLAVE_REMOVE_PAGES: + ret = sgx_ioc_enclave_remove_pages(encl, (void __user *)arg); + break; default: ret = -ENOIOCTLCMD; break; -- Gitee From 83fbe7e91eb0983e9c6f6787061fe3bdec90318e Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:56 -0700 Subject: [PATCH 113/134] x86/sgx: Free up EPC pages directly to support large page ranges commit a0506b3b063641f0a05b2a4399442a38aad22291 upstream. The page reclaimer ensures availability of EPC pages across all enclaves. In support of this it runs independently from the individual enclaves in order to take locks from the different enclaves as it writes pages to swap. When needing to load a page from swap an EPC page needs to be available for its contents to be loaded into. 
Loading an existing enclave page from swap does not reclaim EPC pages directly if none are available; instead, the reclaimer is woken when the available EPC pages are found to be below a watermark. When iterating over a large number of pages in an oversubscribed environment there is a race between waking the reclaimer and the reclaimer freeing EPC pages fast enough for the page operations to proceed. Ensure there are EPC pages available before attempting to load a page that may potentially be pulled from swap into an available EPC page. Intel-SIG: commit a0506b3b0636 x86/sgx: Free up EPC pages directly to support large page ranges. Backport for SGX EDMM support. Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/a0d8f037c4a075d56bf79f432438412985f7ff7a.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/ioctl.c | 6 ++++++ arch/x86/kernel/cpu/sgx/main.c | 11 +++++++++++ arch/x86/kernel/cpu/sgx/sgx.h | 1 + 3 files changed, 18 insertions(+) diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 065cc8d44055..c69a19e7817d 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -745,6 +745,8 @@ sgx_enclave_restrict_permissions(struct sgx_encl *encl, for (c = 0 ; c < modp->length; c += PAGE_SIZE) { addr = encl->base + modp->offset + c; + sgx_reclaim_direct(); + mutex_lock(&encl->lock); entry = sgx_encl_load_page(encl, addr); @@ -910,6 +912,8 @@ static long sgx_enclave_modify_types(struct sgx_encl *encl, for (c = 0 ; c < modt->length; c += PAGE_SIZE) { addr = encl->base + modt->offset + c; + sgx_reclaim_direct(); + mutex_lock(&encl->lock); entry = sgx_encl_load_page(encl, addr); @@ -1096,6 +1100,8 @@ static long sgx_encl_remove_pages(struct sgx_encl *encl, for (c = 0 ; c < params->length; c += PAGE_SIZE) { addr = encl->base + params->offset + c; + sgx_reclaim_direct(); + mutex_lock(&encl->lock); entry = sgx_encl_load_page(encl, addr); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 6f86c184a144..ad1ee2df9508 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -375,6 +375,17 @@ static bool sgx_should_reclaim(unsigned long watermark) !list_empty(&sgx_active_page_list); } +/* + * sgx_reclaim_direct() should be called (without enclave's mutex held) + * in locations where SGX memory resources might be low and might be + * needed in order to make forward progress.
+ */ +void sgx_reclaim_direct(void) +{ + if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) + sgx_reclaim_pages(); +} + static int ksgxd(void *p) { set_freezable(); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index b30cee4de903..0f2020653fba 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -86,6 +86,7 @@ static inline void *sgx_get_epc_virt_addr(struct sgx_epc_page *page) struct sgx_epc_page *__sgx_alloc_epc_page(void); void sgx_free_epc_page(struct sgx_epc_page *page); +void sgx_reclaim_direct(void); void sgx_mark_page_reclaimable(struct sgx_epc_page *page); int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); -- Gitee From 4648255002829ceff33fd9232a4786f5e7593268 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:57 -0700 Subject: [PATCH 114/134] Documentation/x86: Introduce enclave runtime management section commit 629b5155d01b699e50ee63a3973402c64d0ac5d6 upstream. Enclave runtime management is introduced following the pattern of the section describing enclave building. Provide a brief summary of enclave runtime management, pointing to the functions implementing the ioctl()s that will contain details within their kernel-doc. Intel-SIG: commit 629b5155d01b Documentation/x86: Introduce enclave runtime management section. Backport for SGX EDMM support. Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/1da0b9a938b28e68e6870ebd5291490d680e700b.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- Documentation/x86/sgx.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Documentation/x86/sgx.rst b/Documentation/x86/sgx.rst index 265568a9292c..2bcbffacbed5 100644 --- a/Documentation/x86/sgx.rst +++ b/Documentation/x86/sgx.rst @@ -100,6 +100,21 @@ pages and establish enclave page permissions. sgx_ioc_enclave_init sgx_ioc_enclave_provision +Enclave runtime management +-------------------------- + +Systems supporting SGX2 additionally support changes to initialized +enclaves: modifying enclave page permissions and type, and dynamically +adding and removing of enclave pages. When an enclave accesses an address +within its address range that does not have a backing page then a new +regular page will be dynamically added to the enclave. The enclave is +still required to run EACCEPT on the new page before it can be used. + +.. kernel-doc:: arch/x86/kernel/cpu/sgx/ioctl.c + :functions: sgx_ioc_enclave_restrict_permissions + sgx_ioc_enclave_modify_types + sgx_ioc_enclave_remove_pages + Enclave vDSO ------------ -- Gitee From 519cb8302b6658f92989d27f1087270372efbfbc Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:58 -0700 Subject: [PATCH 115/134] selftests/sgx: Add test for EPCM permission changes commit 20404a808593a6812cb485bec16256e702ff94c3 upstream. EPCM permission changes could be made from within (to relax permissions) or out (to restrict permissions) the enclave. Kernel support is needed when permissions are restricted to be able to call the privileged ENCLS[EMODPR] instruction. EPCM permissions can be relaxed via ENCLU[EMODPE] from within the enclave but the enclave still depends on the kernel to install PTEs with the needed permissions. 
Add a test that exercises a few of the enclave page permission flows: 1) Test starts with a RW (from enclave and kernel perspective) enclave page that is mapped via a RW VMA. 2) Use the SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS ioctl() to restrict the enclave (EPCM) page permissions to read-only. 3) Run ENCLU[EACCEPT] from within the enclave to accept the new page permissions. 4) Attempt to write to the enclave page from within the enclave - this should fail with a page fault on the EPCM permissions since the page table entry continues to allow RW access. 5) Restore EPCM permissions to RW by running ENCLU[EMODPE] from within the enclave. 6) Attempt to write to the enclave page from within the enclave - this should succeed since both EPCM and PTE permissions allow this access. Intel-SIG: commit 20404a808593 selftests/sgx: Add test for EPCM permission changes. Backport for SGX EDMM support. Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/2617bf2b2d1e27ca1d0096e1192ae5896baf3f80.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/defines.h | 15 ++ tools/testing/selftests/sgx/main.c | 214 ++++++++++++++++++++++++ tools/testing/selftests/sgx/test_encl.c | 38 +++++ 3 files changed, 267 insertions(+) diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h index 02d775789ea7..b638eb98c80c 100644 --- a/tools/testing/selftests/sgx/defines.h +++ b/tools/testing/selftests/sgx/defines.h @@ -24,6 +24,8 @@ enum encl_op_type { ENCL_OP_PUT_TO_ADDRESS, ENCL_OP_GET_FROM_ADDRESS, ENCL_OP_NOP, + ENCL_OP_EACCEPT, + ENCL_OP_EMODPE, ENCL_OP_MAX, }; @@ -53,4 +55,17 @@ struct encl_op_get_from_addr { uint64_t addr; }; +struct encl_op_eaccept { + struct encl_op_header header; + uint64_t epc_addr; + uint64_t flags; + uint64_t ret; +}; + +struct encl_op_emodpe { + struct encl_op_header header; + uint64_t epc_addr; + uint64_t flags; +}; + #endif /* DEFINES_H */ diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 370c4995f7c4..9d5c5959ffb8 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -25,6 +25,18 @@ static const uint64_t MAGIC = 0x1122334455667788ULL; static const uint64_t MAGIC2 = 0x8877665544332211ULL; vdso_sgx_enter_enclave_t vdso_sgx_enter_enclave; +/* + * Security Information (SECINFO) data structure needed by a few SGX + * instructions (eg. ENCLU[EACCEPT] and ENCLU[EMODPE]) holds meta-data + * about an enclave page. &enum sgx_secinfo_page_state specifies the + * secinfo flags used for page state. + */ +enum sgx_secinfo_page_state { + SGX_SECINFO_PENDING = (1 << 3), + SGX_SECINFO_MODIFIED = (1 << 4), + SGX_SECINFO_PR = (1 << 5), +}; + struct vdso_symtab { Elf64_Sym *elf_symtab; const char *elf_symstrtab; @@ -554,4 +566,206 @@ TEST_F(enclave, pte_permissions) EXPECT_EQ(self->run.exception_addr, 0); } +/* + * Enclave page permission test. + * + * Modify and restore enclave page's EPCM (enclave) permissions from + * outside enclave (ENCLS[EMODPR] via kernel) as well as from within + * enclave (via ENCLU[EMODPE]). Check for page fault if + * VMA allows access but EPCM permissions do not. 
+ */ +TEST_F(enclave, epcm_permissions) +{ + struct sgx_enclave_restrict_permissions restrict_ioc; + struct encl_op_get_from_addr get_addr_op; + struct encl_op_put_to_addr put_addr_op; + struct encl_op_eaccept eaccept_op; + struct encl_op_emodpe emodpe_op; + unsigned long data_start; + int ret, errno_save; + + ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); + + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; + + /* + * Ensure kernel supports needed ioctl() and system supports needed + * commands. + */ + memset(&restrict_ioc, 0, sizeof(restrict_ioc)); + + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS, + &restrict_ioc); + errno_save = ret == -1 ? errno : 0; + + /* + * Invalid parameters were provided during sanity check, + * expect command to fail. + */ + ASSERT_EQ(ret, -1); + + /* ret == -1 */ + if (errno_save == ENOTTY) + SKIP(return, + "Kernel does not support SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS ioctl()"); + else if (errno_save == ENODEV) + SKIP(return, "System does not support SGX2"); + + /* + * Page that will have its permissions changed is the second data + * page in the .data segment. This forms part of the local encl_buffer + * within the enclave. + * + * At start of test @data_start should have EPCM as well as PTE and + * VMA permissions of RW. + */ + + data_start = self->encl.encl_base + + encl_get_data_offset(&self->encl) + PAGE_SIZE; + + /* + * Sanity check that page at @data_start is writable before making + * any changes to page permissions. + * + * Start by writing MAGIC to test page. + */ + put_addr_op.value = MAGIC; + put_addr_op.addr = data_start; + put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; + + EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + /* + * Read memory that was just written to, confirming that + * page is writable. + */ + get_addr_op.value = 0; + get_addr_op.addr = data_start; + get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; + + EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); + + EXPECT_EQ(get_addr_op.value, MAGIC); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + /* + * Change EPCM permissions to read-only. Kernel still considers + * the page writable. + */ + memset(&restrict_ioc, 0, sizeof(restrict_ioc)); + + restrict_ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; + restrict_ioc.length = PAGE_SIZE; + restrict_ioc.permissions = SGX_SECINFO_R; + + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS, + &restrict_ioc); + errno_save = ret == -1 ? errno : 0; + + EXPECT_EQ(ret, 0); + EXPECT_EQ(errno_save, 0); + EXPECT_EQ(restrict_ioc.result, 0); + EXPECT_EQ(restrict_ioc.count, 4096); + + /* + * EPCM permissions changed from kernel, need to EACCEPT from enclave. 
+ */ + eaccept_op.epc_addr = data_start; + eaccept_op.flags = SGX_SECINFO_R | SGX_SECINFO_REG | SGX_SECINFO_PR;
+ eaccept_op.ret = 0; + eaccept_op.header.type = ENCL_OP_EACCEPT; + + EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0);
+ + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0);
+ EXPECT_EQ(self->run.exception_addr, 0); + EXPECT_EQ(eaccept_op.ret, 0); + + /* + * EPCM permissions of the page are now
+ * read-only, expect #PF on EPCM when attempting to write to the page + * from within the enclave. + */ + put_addr_op.value = MAGIC2;
+ + EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); + + EXPECT_EQ(self->run.function, ERESUME);
+ EXPECT_EQ(self->run.exception_vector, 14); + EXPECT_EQ(self->run.exception_error_code, 0x8007);
+ EXPECT_EQ(self->run.exception_addr, data_start); + + self->run.exception_vector = 0; + self->run.exception_error_code = 0;
+ self->run.exception_addr = 0; + + /* + * Received AEX but cannot return to enclave at same entrypoint, + * need a different TCS
+ * from which the EPCM permissions can be made writable again. + */ + self->run.tcs = self->encl.encl_base + PAGE_SIZE;
+ + /* + * Enter enclave at new TCS to change EPCM permissions to be + * writable again and thus fix the page fault that triggered the
+ * AEX. + */ + + emodpe_op.epc_addr = data_start; + emodpe_op.flags = SGX_SECINFO_R | SGX_SECINFO_W;
+ emodpe_op.header.type = ENCL_OP_EMODPE; + + EXPECT_EQ(ENCL_CALL(&emodpe_op, &self->run, true), 0);
+ + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0);
+ EXPECT_EQ(self->run.exception_addr, 0); + + /* + * Wrong page permissions that caused the original fault have now been
+ * fixed via EPCM permissions. Return to the main TCS to resume execution
+ * at the faulting instruction; the PTE should continue to allow writing
+ * to the page.
+ */ + self->run.tcs = self->encl.encl_base; + + EXPECT_EQ(vdso_sgx_enter_enclave((unsigned long)&put_addr_op, 0, 0, + ERESUME, 0, 0, + &self->run), + 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + get_addr_op.value = 0; + + EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); + + EXPECT_EQ(get_addr_op.value, MAGIC2); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.user_data, 0); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c index 4fca01cfd898..5b6c65331527 100644 --- a/tools/testing/selftests/sgx/test_encl.c +++ b/tools/testing/selftests/sgx/test_encl.c @@ -11,6 +11,42 @@ */ static uint8_t encl_buffer[8192] = { 1 }; +enum sgx_enclu_function { + EACCEPT = 0x5, + EMODPE = 0x6, +}; + +static void do_encl_emodpe(void *_op) +{ + struct sgx_secinfo secinfo __aligned(sizeof(struct sgx_secinfo)) = {0}; + struct encl_op_emodpe *op = _op; + + secinfo.flags = op->flags; + + asm volatile(".byte 0x0f, 0x01, 0xd7" + : + : "a" (EMODPE), + "b" (&secinfo), + "c" (op->epc_addr)); +} + +static void do_encl_eaccept(void *_op) +{ + struct sgx_secinfo secinfo __aligned(sizeof(struct sgx_secinfo)) = {0}; + struct encl_op_eaccept *op = _op; + int rax; + + secinfo.flags = op->flags; + + asm volatile(".byte 0x0f, 0x01, 0xd7" + : "=a" (rax) + : "a" (EACCEPT), + "b" (&secinfo), + "c" (op->epc_addr)); + + op->ret = rax; +} + static void *memcpy(void *dest, const void *src, size_t n) { size_t i; @@ -62,6 +98,8 @@ void encl_body(void *rdi, void *rsi) do_encl_op_put_to_addr, do_encl_op_get_from_addr, do_encl_op_nop, + do_encl_eaccept, + do_encl_emodpe, }; struct encl_op_header *op = (struct encl_op_header *)rdi; -- Gitee From 4a7d6c94e2dfd7084a69e08879c869eeaff154ad Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:59 -0700 Subject: [PATCH 116/134] selftests/sgx: Add test for TCS page permission changes commit 7088c81f94733fd5d103f8975d5e1d1fad12f665 upstream. Kernel should not allow permission changes on TCS pages. Add test to confirm this behavior. Intel-SIG: commit 7088c81f9473 selftests/sgx: Add test for TCS page permission changes. Backport for SGX EDMM support. Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/0121ad1b21befb94519072e2c18b89aa5dca00d4.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/main.c | 71 ++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 9d5c5959ffb8..3d1e296c5634 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -121,6 +121,24 @@ static Elf64_Sym *vdso_symtab_get(struct vdso_symtab *symtab, const char *name) return NULL; } +/* + * Return the offset in the enclave where the TCS segment can be found. + * The first RW segment loaded is the TCS. 
+ */ +static off_t encl_get_tcs_offset(struct encl *encl) +{ + int i; + + for (i = 0; i < encl->nr_segments; i++) {
+ struct encl_segment *seg = &encl->segment_tbl[i]; + + if (i == 0 && seg->prot == (PROT_READ | PROT_WRITE))
+ return seg->offset; + } + + return -1; +} + /* * Return the offset in the enclave where the data segment can be found.
* The first RW segment loaded is the TCS, skip that to get info on the @@ -566,6 +584,59 @@ TEST_F(enclave, pte_permissions)
EXPECT_EQ(self->run.exception_addr, 0); } +/* + * Modifying permissions of TCS page should not be possible. + */
+TEST_F(enclave, tcs_permissions) +{ + struct sgx_enclave_restrict_permissions ioc; + int ret, errno_save;
+ + ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); + + memset(&self->run, 0, sizeof(self->run));
+ self->run.tcs = self->encl.encl_base; + + memset(&ioc, 0, sizeof(ioc)); + + /* + * Ensure kernel supports needed ioctl() and
system supports needed + * commands. + */ + + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS, &ioc);
+ errno_save = ret == -1 ? errno : 0; + + /* + * Invalid parameters were provided during sanity check, + * expect command to fail.
+ */ + ASSERT_EQ(ret, -1); + + /* ret == -1 */ + if (errno_save == ENOTTY) + SKIP(return,
+ "Kernel does not support SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS ioctl()"); + else if (errno_save == ENODEV)
+ SKIP(return, "System does not support SGX2"); + + /* + * Attempt to make TCS page read-only. This is not allowed and
+ * should be prevented by the kernel. + */ + ioc.offset = encl_get_tcs_offset(&self->encl); + ioc.length = PAGE_SIZE;
+ ioc.permissions = SGX_SECINFO_R; + + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS, &ioc);
+ errno_save = ret == -1 ? errno : 0; + + EXPECT_EQ(ret, -1); + EXPECT_EQ(errno_save, EINVAL); + EXPECT_EQ(ioc.result, 0);
+ EXPECT_EQ(ioc.count, 0); +} + /* * Enclave page permission test. * -- Gitee From a16933da2566359578b9ebdb2da19eb740ecc427
Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:09:00 -0700 Subject: [PATCH 117/134] selftests/sgx: Test
two different SGX2 EAUG flows commit 67f1f70a23d117628d5cfc78bcdf8eb9d2d04874 upstream. Enclave pages can be added to an
initialized enclave when an address belonging to the enclave but without a backing page is accessed from within the enclave.
Accessing memory without a backing enclave page from within an enclave can happen in two different ways: 1) Pre-emptively run
ENCLU[EACCEPT]. Since the addition of a page always needs to be accepted by the enclave via ENCLU[EACCEPT], this flow is
efficient: the first execution of ENCLU[EACCEPT] triggers the addition of the page, and when execution returns to the same
instruction the second execution succeeds as an acceptance of the page. 2) A direct read or write. In the flow where a direct
read or write triggers the page addition, execution cannot resume from the instruction (read/write) that triggered the fault;
instead the enclave needs to be entered at a different entry point to run the needed ENCLU[EACCEPT] before execution can
return to the original entry point and the read/write instruction that faulted. Add tests for both flows. Intel-SIG: commit
67f1f70a23d1 selftests/sgx: Test two different SGX2 EAUG flows. Backport for SGX EDMM support.
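[ Editor's note: a minimal sketch of the two flows, using the selftest helpers and SECINFO flags the diff below relies on; names such as new_page_addr, run and encl_base are illustrative and error checking is omitted:

	struct encl_op_eaccept eaccept_op;

	/* Flow 1: pre-emptive ENCLU[EACCEPT]. The first attempt faults,
	 * the kernel EAUGs the page, the retried EACCEPT succeeds. */
	eaccept_op.epc_addr = new_page_addr;
	eaccept_op.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_REG |
			   SGX_SECINFO_PENDING;
	eaccept_op.ret = 0;
	eaccept_op.header.type = ENCL_OP_EACCEPT;
	ENCL_CALL(&eaccept_op, &run, true);

	/* Flow 2: a direct write faults with the SGX bit set, so enter
	 * via a second TCS to run EACCEPT, then ERESUME the write. */
	run.tcs = encl_base + PAGE_SIZE;	/* alternate entry point */
	ENCL_CALL(&eaccept_op, &run, true);	/* accept the EAUGed page */
	run.tcs = encl_base;			/* back to the faulting TCS */
	vdso_sgx_enter_enclave((unsigned long)&put_addr_op, 0, 0,
			       ERESUME, 0, 0, &run);
]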
Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/0c321e0e32790ac1de742ce5017a331e6d902ac1.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/main.c | 250 +++++++++++++++++++++++++++++ 1 file changed, 250 insertions(+) diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 3d1e296c5634..73ae108d54d8 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -86,6 +86,15 @@ static bool vdso_get_symtab(void *addr, struct vdso_symtab *symtab) return true; } +static inline int sgx2_supported(void) +{ + unsigned int eax, ebx, ecx, edx; + + __cpuid_count(SGX_CPUID, 0x0, eax, ebx, ecx, edx); + + return eax & 0x2; +} + static unsigned long elf_sym_hash(const char *name) { unsigned long h = 0, high; @@ -839,4 +848,245 @@ TEST_F(enclave, epcm_permissions) EXPECT_EQ(self->run.exception_addr, 0); } +/* + * Test the addition of pages to an initialized enclave via writing to + * a page belonging to the enclave's address space but was not added + * during enclave creation. + */ +TEST_F(enclave, augment) +{ + struct encl_op_get_from_addr get_addr_op; + struct encl_op_put_to_addr put_addr_op; + struct encl_op_eaccept eaccept_op; + size_t total_size = 0; + void *addr; + int i; + + if (!sgx2_supported()) + SKIP(return, "SGX2 not supported"); + + ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); + + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; + + for (i = 0; i < self->encl.nr_segments; i++) { + struct encl_segment *seg = &self->encl.segment_tbl[i]; + + total_size += seg->size; + } + + /* + * Actual enclave size is expected to be larger than the loaded + * test enclave since enclave size must be a power of 2 in bytes + * and test_encl does not consume it all. + */ + EXPECT_LT(total_size + PAGE_SIZE, self->encl.encl_size); + + /* + * Create memory mapping for the page that will be added. New + * memory mapping is for one page right after all existing + * mappings. + * Kernel will allow new mapping using any permissions if it + * falls into the enclave's address range but not backed + * by existing enclave pages. + */ + addr = mmap((void *)self->encl.encl_base + total_size, PAGE_SIZE, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_SHARED | MAP_FIXED, self->encl.fd, 0); + EXPECT_NE(addr, MAP_FAILED); + + self->run.exception_vector = 0; + self->run.exception_error_code = 0; + self->run.exception_addr = 0; + + /* + * Attempt to write to the new page from within enclave. + * Expected to fail since page is not (yet) part of the enclave. + * The first #PF will trigger the addition of the page to the + * enclave, but since the new page needs an EACCEPT from within the + * enclave before it can be used it would not be possible + * to successfully return to the failing instruction. This is the + * cause of the second #PF captured here having the SGX bit set, + * it is from hardware preventing the page from being used. 
+ */ + put_addr_op.value = MAGIC; + put_addr_op.addr = (unsigned long)addr; + put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; + + EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); + + EXPECT_EQ(self->run.function, ERESUME); + EXPECT_EQ(self->run.exception_vector, 14); + EXPECT_EQ(self->run.exception_addr, (unsigned long)addr); + + if (self->run.exception_error_code == 0x6) { + munmap(addr, PAGE_SIZE); + SKIP(return, "Kernel does not support adding pages to initialized enclave"); + } + + EXPECT_EQ(self->run.exception_error_code, 0x8007); + + self->run.exception_vector = 0; + self->run.exception_error_code = 0; + self->run.exception_addr = 0; + + /* Handle AEX by running EACCEPT from new entry point. */ + self->run.tcs = self->encl.encl_base + PAGE_SIZE; + + eaccept_op.epc_addr = self->encl.encl_base + total_size; + eaccept_op.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_REG | SGX_SECINFO_PENDING; + eaccept_op.ret = 0; + eaccept_op.header.type = ENCL_OP_EACCEPT; + + EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + EXPECT_EQ(eaccept_op.ret, 0); + + /* Can now return to main TCS to resume execution. */ + self->run.tcs = self->encl.encl_base; + + EXPECT_EQ(vdso_sgx_enter_enclave((unsigned long)&put_addr_op, 0, 0, + ERESUME, 0, 0, + &self->run), + 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + /* + * Read memory from newly added page that was just written to, + * confirming that data previously written (MAGIC) is present. + */ + get_addr_op.value = 0; + get_addr_op.addr = (unsigned long)addr; + get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; + + EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); + + EXPECT_EQ(get_addr_op.value, MAGIC); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + munmap(addr, PAGE_SIZE); +} + +/* + * Test for the addition of pages to an initialized enclave via a + * pre-emptive run of EACCEPT on page to be added. + */ +TEST_F(enclave, augment_via_eaccept) +{ + struct encl_op_get_from_addr get_addr_op; + struct encl_op_put_to_addr put_addr_op; + struct encl_op_eaccept eaccept_op; + size_t total_size = 0; + void *addr; + int i; + + if (!sgx2_supported()) + SKIP(return, "SGX2 not supported"); + + ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); + + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; + + for (i = 0; i < self->encl.nr_segments; i++) { + struct encl_segment *seg = &self->encl.segment_tbl[i]; + + total_size += seg->size; + } + + /* + * Actual enclave size is expected to be larger than the loaded + * test enclave since enclave size must be a power of 2 in bytes while + * test_encl does not consume it all. + */ + EXPECT_LT(total_size + PAGE_SIZE, self->encl.encl_size); + + /* + * mmap() a page at end of existing enclave to be used for dynamic + * EPC page. + * + * Kernel will allow new mapping using any permissions if it + * falls into the enclave's address range but not backed + * by existing enclave pages. 
+ */ + + addr = mmap((void *)self->encl.encl_base + total_size, PAGE_SIZE, + PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_SHARED | MAP_FIXED, + self->encl.fd, 0); + EXPECT_NE(addr, MAP_FAILED); + + self->run.exception_vector = 0;
+ self->run.exception_error_code = 0; + self->run.exception_addr = 0; + + /* + * Run EACCEPT on new page to trigger the
#PF->EAUG->EACCEPT(again + * without a #PF). All should be transparent to userspace. + */
+ eaccept_op.epc_addr = self->encl.encl_base + total_size;
+ eaccept_op.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_REG | SGX_SECINFO_PENDING; + eaccept_op.ret = 0;
+ eaccept_op.header.type = ENCL_OP_EACCEPT; + + EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0);
+ + if (self->run.exception_vector == 14 && + self->run.exception_error_code == 4 &&
+ self->run.exception_addr == self->encl.encl_base + total_size) { + munmap(addr, PAGE_SIZE);
+ SKIP(return, "Kernel does not support adding pages to initialized enclave"); + } + + EXPECT_EEXIT(&self->run);
+ EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0);
+ EXPECT_EQ(self->run.exception_addr, 0); + EXPECT_EQ(eaccept_op.ret, 0); + + /* + * New page should be accessible from
within enclave - attempt to + * write to it. + */ + put_addr_op.value = MAGIC; + put_addr_op.addr = (unsigned long)addr;
+ put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; + + EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0);
+ + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0);
+ EXPECT_EQ(self->run.exception_addr, 0); + + /* + * Read memory from newly added page that was just written to,
+ * confirming that data previously written (MAGIC) is present. + */ + get_addr_op.value = 0;
+ get_addr_op.addr = (unsigned long)addr; + get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS;
+ + EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); + + EXPECT_EQ(get_addr_op.value, MAGIC);
+ EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0);
+ EXPECT_EQ(self->run.exception_addr, 0); + + munmap(addr, PAGE_SIZE); +} + TEST_HARNESS_MAIN -- Gitee
From e0246b021d54a2422292e85851c341499f78a399 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:09:01
-0700 Subject: [PATCH 118/134] selftests/sgx: Introduce dynamic entry point commit 7eb4370152beb2f1e25543088bce2e3f0621ab81
upstream. The test enclave (test_encl.elf) is built with two initialized Thread Control Structures (TCS) included in the
binary. Both TCS are initialized with the same entry point, encl_entry, that correctly computes the absolute address of the
stack based on the stack of each TCS that is also built into the binary. A new TCS can be added dynamically to the enclave
and needs to be initialized with an entry point used to enter the enclave. Since the existing entry point, encl_entry,
assumes that the TCS and its stack exist at particular offsets within the binary, it is not able to handle a dynamically
added TCS and its stack. Introduce a new entry point, encl_dyn_entry, that initializes the absolute address of that thread's
stack to the address immediately preceding the TCS itself. It is now possible to dynamically add a contiguous memory region
to the enclave with the new stack preceding the new TCS. With the new TCS initialized with encl_dyn_entry as its entry point,
the absolute address of the stack is computed correctly on entry.
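[ Editor's note: the layout encl_dyn_entry assumes, sketched with encl_base and total_size as used by the tests in this series (names illustrative):

	unsigned long stack = encl_base + total_size;		  /* regular page  */
	unsigned long tcs   = encl_base + total_size + PAGE_SIZE; /* new TCS page */

	/* On enclave entry RBX holds the TCS address, so the stub's
	 * "lea -1(%rbx), %rax" sets the initial stack pointer to tcs - 1,
	 * i.e. the top of the stack page immediately preceding the TCS. */
]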
Intel-SIG: commit 7eb4370152be selftests/sgx: Introduce dynamic entry point. Backport for SGX EDMM support. Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/93e9c420dedf5f773ba6965c18245bc7d62aca83.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/test_encl_bootstrap.S | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/sgx/test_encl_bootstrap.S b/tools/testing/selftests/sgx/test_encl_bootstrap.S index 82fb0dfcbd23..03ae0f57e29d 100644 --- a/tools/testing/selftests/sgx/test_encl_bootstrap.S +++ b/tools/testing/selftests/sgx/test_encl_bootstrap.S @@ -45,6 +45,12 @@ encl_entry: # TCS #2. By adding the value of encl_stack to it, we get # the absolute address for the stack. lea (encl_stack)(%rbx), %rax + jmp encl_entry_core +encl_dyn_entry: + # Entry point for dynamically created TCS page expected to follow + # its stack directly. + lea -1(%rbx), %rax +encl_entry_core: xchg %rsp, %rax push %rax -- Gitee From c8a526035e1f7cef5107e43bb0ee3cb223ec52eb Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:09:02 -0700 Subject: [PATCH 119/134] selftests/sgx: Introduce TCS initialization enclave operation commit b564982fda13be6314e49f2344e7c422565e34d3 upstream. The Thread Control Structure (TCS) contains meta-data used by the hardware to save and restore thread specific information when entering/exiting the enclave. A TCS can be added to an initialized enclave by first adding a new regular enclave page, initializing the content of the new page from within the enclave, and then changing that page's type to a TCS. Support the initialization of a TCS from within the enclave. The variable information needed that should be provided from outside the enclave is the address of the TCS, address of the State Save Area (SSA), and the entry point that the thread should use to enter the enclave. With this information provided all needed fields of a TCS can be initialized. Intel-SIG: commit b564982fda13 selftests/sgx: Introduce TCS initialization enclave operation. Backport for SGX EDMM support. 
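[ Editor's note: a sketch of how a caller is expected to drive the new operation; it mirrors the tcs_create test added later in this series, with tcs, total_size, encl and run illustrative. Note that OSSA and OENTRY are offsets from the enclave base while tcs_page is a linear address:

	struct encl_op_init_tcs_page op;

	op.header.type = ENCL_OP_INIT_TCS_PAGE;
	op.tcs_page = (unsigned long)tcs;		/* linear address */
	op.ssa = total_size + 2 * PAGE_SIZE;		/* OSSA offset    */
	op.entry = encl_get_entry(&encl, "encl_dyn_entry"); /* OENTRY offset */
	ENCL_CALL(&op, &run, true);
]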
Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/bad6052056188bde753a54313da1ac8f1e29088a.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/defines.h | 8 +++++++ tools/testing/selftests/sgx/test_encl.c | 30 +++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h index b638eb98c80c..d8587c971941 100644 --- a/tools/testing/selftests/sgx/defines.h +++ b/tools/testing/selftests/sgx/defines.h @@ -26,6 +26,7 @@ enum encl_op_type { ENCL_OP_NOP, ENCL_OP_EACCEPT, ENCL_OP_EMODPE, + ENCL_OP_INIT_TCS_PAGE, ENCL_OP_MAX, }; @@ -68,4 +69,11 @@ struct encl_op_emodpe { uint64_t flags; }; +struct encl_op_init_tcs_page { + struct encl_op_header header; + uint64_t tcs_page; + uint64_t ssa; + uint64_t entry; +}; + #endif /* DEFINES_H */ diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c index 5b6c65331527..c0d6397295e3 100644 --- a/tools/testing/selftests/sgx/test_encl.c +++ b/tools/testing/selftests/sgx/test_encl.c @@ -57,6 +57,35 @@ static void *memcpy(void *dest, const void *src, size_t n) return dest; } +static void *memset(void *dest, int c, size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) + ((char *)dest)[i] = c; + + return dest; +} + +static void do_encl_init_tcs_page(void *_op) +{ + struct encl_op_init_tcs_page *op = _op; + void *tcs = (void *)op->tcs_page; + uint32_t val_32; + + memset(tcs, 0, 16); /* STATE and FLAGS */ + memcpy(tcs + 16, &op->ssa, 8); /* OSSA */ + memset(tcs + 24, 0, 4); /* CSSA */ + val_32 = 1; + memcpy(tcs + 28, &val_32, 4); /* NSSA */ + memcpy(tcs + 32, &op->entry, 8); /* OENTRY */ + memset(tcs + 40, 0, 24); /* AEP, OFSBASE, OGSBASE */ + val_32 = 0xFFFFFFFF; + memcpy(tcs + 64, &val_32, 4); /* FSLIMIT */ + memcpy(tcs + 68, &val_32, 4); /* GSLIMIT */ + memset(tcs + 72, 0, 4024); /* Reserved */ +} + static void do_encl_op_put_to_buf(void *op) { struct encl_op_put_to_buf *op2 = op; @@ -100,6 +129,7 @@ void encl_body(void *rdi, void *rsi) do_encl_op_nop, do_encl_eaccept, do_encl_emodpe, + do_encl_init_tcs_page, }; struct encl_op_header *op = (struct encl_op_header *)rdi; -- Gitee From 4f7851659273ce9b0571b74a7c42949bc80f6151 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:09:03 -0700 Subject: [PATCH 120/134] selftests/sgx: Test complete changing of page type flow commit 33c5aac3bf32c3ef120ad6d2eb5c65ab64a5fec4 upstream. Support for changing an enclave page's type enables an initialized enclave to be expanded with support for more threads by changing the type of a regular enclave page to that of a Thread Control Structure (TCS). Additionally, being able to change a TCS or regular enclave page's type to be trimmed (SGX_PAGE_TYPE_TRIM) initiates the removal of the page from the enclave. Test changing page type to TCS as well as page removal flows in two phases: In the first phase support for a new thread is dynamically added to an initialized enclave and in the second phase the pages associated with the new thread are removed from the enclave. As an additional sanity check after the second phase the page used as a TCS page during the first phase is added back as a regular page and ensured that it can be written to (which is not possible if it was a TCS page). Intel-SIG: commit 33c5aac3bf32 selftests/sgx: Test complete changing of page type flow. Backport for SGX EDMM support. 
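[ Editor's note: the core of phase 1's type change, sketched with the SGX2 UAPI structs used throughout this series; tcs_offset and encl_fd are illustrative and error handling is omitted:

	struct sgx_enclave_modify_types ioc = {
		.offset = tcs_offset,		/* page-aligned enclave offset */
		.length = PAGE_SIZE,
		.page_type = SGX_PAGE_TYPE_TCS,
	};

	ioctl(encl_fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &ioc);

	/* ...followed, from within the enclave, by an EACCEPT with: */
	eaccept_op.flags = SGX_SECINFO_TCS | SGX_SECINFO_MODIFIED;

Phase 2 then uses SGX_PAGE_TYPE_TRIM plus SGX_IOC_ENCLAVE_REMOVE_PAGES, as the diff shows. ]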
Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/d05b48b00338683a94dcaef9f478540fc3d6d5f9.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/load.c | 41 ++++ tools/testing/selftests/sgx/main.c | 343 +++++++++++++++++++++++++++++ tools/testing/selftests/sgx/main.h | 1 + 3 files changed, 385 insertions(+) diff --git a/tools/testing/selftests/sgx/load.c b/tools/testing/selftests/sgx/load.c index 9d4322c946e2..41b9d2031799 100644 --- a/tools/testing/selftests/sgx/load.c +++ b/tools/testing/selftests/sgx/load.c @@ -129,6 +129,47 @@ static bool encl_ioc_add_pages(struct encl *encl, struct encl_segment *seg) return true; } +/* + * Parse the enclave code's symbol table to locate and return address of + * the provided symbol + */ +uint64_t encl_get_entry(struct encl *encl, const char *symbol) +{ + Elf64_Shdr *sections; + Elf64_Sym *symtab; + Elf64_Ehdr *ehdr; + char *sym_names; + int num_sym; + int i; + + ehdr = encl->bin; + sections = encl->bin + ehdr->e_shoff; + + for (i = 0; i < ehdr->e_shnum; i++) { + if (sections[i].sh_type == SHT_SYMTAB) { + symtab = (Elf64_Sym *)((char *)encl->bin + sections[i].sh_offset); + num_sym = sections[i].sh_size / sections[i].sh_entsize; + break; + } + } + + for (i = 0; i < ehdr->e_shnum; i++) { + if (sections[i].sh_type == SHT_STRTAB) { + sym_names = (char *)encl->bin + sections[i].sh_offset; + break; + } + } + + for (i = 0; i < num_sym; i++) { + Elf64_Sym *sym = &symtab[i]; + + if (!strcmp(symbol, sym_names + sym->st_name)) + return (uint64_t)sym->st_value; + } + + return 0; +} + bool encl_load(const char *path, struct encl *encl, unsigned long heap_size) { const char device_path[] = "/dev/sgx_enclave"; diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 73ae108d54d8..d283faed018f 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -1089,4 +1089,347 @@ TEST_F(enclave, augment_via_eaccept) munmap(addr, PAGE_SIZE); } +/* + * SGX2 page type modification test in two phases: + * Phase 1: + * Create a new TCS, consisting out of three new pages (stack page with regular + * page type, SSA page with regular page type, and TCS page with TCS page + * type) in an initialized enclave and run a simple workload within it. + * Phase 2: + * Remove the three pages added in phase 1, add a new regular page at the + * same address that previously hosted the TCS page and verify that it can + * be modified. + */ +TEST_F(enclave, tcs_create) +{ + struct encl_op_init_tcs_page init_tcs_page_op; + struct sgx_enclave_remove_pages remove_ioc; + struct encl_op_get_from_addr get_addr_op; + struct sgx_enclave_modify_types modt_ioc; + struct encl_op_put_to_addr put_addr_op; + struct encl_op_get_from_buf get_buf_op; + struct encl_op_put_to_buf put_buf_op; + void *addr, *tcs, *stack_end, *ssa; + struct encl_op_eaccept eaccept_op; + size_t total_size = 0; + uint64_t val_64; + int errno_save; + int ret, i; + + ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, + _metadata)); + + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; + + /* + * Hardware (SGX2) and kernel support is needed for this test. Start + * with check that test has a chance of succeeding. 
+ */ + memset(&modt_ioc, 0, sizeof(modt_ioc)); + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); + + if (ret == -1) { + if (errno == ENOTTY) + SKIP(return, + "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()"); + else if (errno == ENODEV) + SKIP(return, "System does not support SGX2"); + } + + /* + * Invalid parameters were provided during sanity check, + * expect command to fail. + */ + EXPECT_EQ(ret, -1); + + /* + * Add three regular pages via EAUG: one will be the TCS stack, one + * will be the TCS SSA, and one will be the new TCS. The stack and + * SSA will remain as regular pages, the TCS page will need its + * type changed after populated with needed data. + */ + for (i = 0; i < self->encl.nr_segments; i++) { + struct encl_segment *seg = &self->encl.segment_tbl[i]; + + total_size += seg->size; + } + + /* + * Actual enclave size is expected to be larger than the loaded + * test enclave since enclave size must be a power of 2 in bytes while + * test_encl does not consume it all. + */ + EXPECT_LT(total_size + 3 * PAGE_SIZE, self->encl.encl_size); + + /* + * mmap() three pages at end of existing enclave to be used for the + * three new pages. + */ + addr = mmap((void *)self->encl.encl_base + total_size, 3 * PAGE_SIZE, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, + self->encl.fd, 0); + EXPECT_NE(addr, MAP_FAILED); + + self->run.exception_vector = 0; + self->run.exception_error_code = 0; + self->run.exception_addr = 0; + + stack_end = (void *)self->encl.encl_base + total_size; + tcs = (void *)self->encl.encl_base + total_size + PAGE_SIZE; + ssa = (void *)self->encl.encl_base + total_size + 2 * PAGE_SIZE; + + /* + * Run EACCEPT on each new page to trigger the + * EACCEPT->(#PF)->EAUG->EACCEPT(again without a #PF) flow. + */ + + eaccept_op.epc_addr = (unsigned long)stack_end; + eaccept_op.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_REG | SGX_SECINFO_PENDING; + eaccept_op.ret = 0; + eaccept_op.header.type = ENCL_OP_EACCEPT; + + EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); + + if (self->run.exception_vector == 14 && + self->run.exception_error_code == 4 && + self->run.exception_addr == (unsigned long)stack_end) { + munmap(addr, 3 * PAGE_SIZE); + SKIP(return, "Kernel does not support adding pages to initialized enclave"); + } + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + EXPECT_EQ(eaccept_op.ret, 0); + + eaccept_op.epc_addr = (unsigned long)ssa; + + EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + EXPECT_EQ(eaccept_op.ret, 0); + + eaccept_op.epc_addr = (unsigned long)tcs; + + EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + EXPECT_EQ(eaccept_op.ret, 0); + + /* + * Three new pages added to enclave. Now populate the TCS page with + * needed data. This should be done from within enclave. Provide + * the function that will do the actual data population with needed + * data. + */ + + /* + * New TCS will use the "encl_dyn_entry" entrypoint that expects + * stack to begin in page before TCS page. 
+ */ + val_64 = encl_get_entry(&self->encl, "encl_dyn_entry"); + EXPECT_NE(val_64, 0); + + init_tcs_page_op.tcs_page = (unsigned long)tcs; + init_tcs_page_op.ssa = (unsigned long)total_size + 2 * PAGE_SIZE; + init_tcs_page_op.entry = val_64; + init_tcs_page_op.header.type = ENCL_OP_INIT_TCS_PAGE; + + EXPECT_EQ(ENCL_CALL(&init_tcs_page_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + /* Change TCS page type to TCS. */ + memset(&modt_ioc, 0, sizeof(modt_ioc)); + + modt_ioc.offset = total_size + PAGE_SIZE; + modt_ioc.length = PAGE_SIZE; + modt_ioc.page_type = SGX_PAGE_TYPE_TCS; + + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); + errno_save = ret == -1 ? errno : 0; + + EXPECT_EQ(ret, 0); + EXPECT_EQ(errno_save, 0); + EXPECT_EQ(modt_ioc.result, 0); + EXPECT_EQ(modt_ioc.count, 4096); + + /* EACCEPT new TCS page from enclave. */ + eaccept_op.epc_addr = (unsigned long)tcs; + eaccept_op.flags = SGX_SECINFO_TCS | SGX_SECINFO_MODIFIED; + eaccept_op.ret = 0; + eaccept_op.header.type = ENCL_OP_EACCEPT; + + EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + EXPECT_EQ(eaccept_op.ret, 0); + + /* Run workload from new TCS. */ + self->run.tcs = (unsigned long)tcs; + + /* + * Simple workload to write to data buffer and read value back. + */ + put_buf_op.header.type = ENCL_OP_PUT_TO_BUFFER; + put_buf_op.value = MAGIC; + + EXPECT_EQ(ENCL_CALL(&put_buf_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + get_buf_op.header.type = ENCL_OP_GET_FROM_BUFFER; + get_buf_op.value = 0; + + EXPECT_EQ(ENCL_CALL(&get_buf_op, &self->run, true), 0); + + EXPECT_EQ(get_buf_op.value, MAGIC); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + /* + * Phase 2 of test: + * Remove pages associated with new TCS, create a regular page + * where TCS page used to be and verify it can be used as a regular + * page. + */ + + /* Start page removal by requesting change of page type to PT_TRIM. */ + memset(&modt_ioc, 0, sizeof(modt_ioc)); + + modt_ioc.offset = total_size; + modt_ioc.length = 3 * PAGE_SIZE; + modt_ioc.page_type = SGX_PAGE_TYPE_TRIM; + + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); + errno_save = ret == -1 ? errno : 0; + + EXPECT_EQ(ret, 0); + EXPECT_EQ(errno_save, 0); + EXPECT_EQ(modt_ioc.result, 0); + EXPECT_EQ(modt_ioc.count, 3 * PAGE_SIZE); + + /* + * Enter enclave via TCS #1 and approve page removal by sending + * EACCEPT for each of three removed pages. 
+ */ + self->run.tcs = self->encl.encl_base; + + eaccept_op.epc_addr = (unsigned long)stack_end; + eaccept_op.flags = SGX_SECINFO_TRIM | SGX_SECINFO_MODIFIED; + eaccept_op.ret = 0; + eaccept_op.header.type = ENCL_OP_EACCEPT; + + EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + EXPECT_EQ(eaccept_op.ret, 0); + + eaccept_op.epc_addr = (unsigned long)tcs; + eaccept_op.ret = 0; + + EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + EXPECT_EQ(eaccept_op.ret, 0); + + eaccept_op.epc_addr = (unsigned long)ssa; + eaccept_op.ret = 0; + + EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + EXPECT_EQ(eaccept_op.ret, 0); + + /* Send final ioctl() to complete page removal. */ + memset(&remove_ioc, 0, sizeof(remove_ioc)); + + remove_ioc.offset = total_size; + remove_ioc.length = 3 * PAGE_SIZE; + + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_REMOVE_PAGES, &remove_ioc); + errno_save = ret == -1 ? errno : 0; + + EXPECT_EQ(ret, 0); + EXPECT_EQ(errno_save, 0); + EXPECT_EQ(remove_ioc.count, 3 * PAGE_SIZE); + + /* + * Enter enclave via TCS #1 and access location where TCS #3 was to + * trigger dynamic add of regular page at that location. + */ + eaccept_op.epc_addr = (unsigned long)tcs; + eaccept_op.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_REG | SGX_SECINFO_PENDING; + eaccept_op.ret = 0; + eaccept_op.header.type = ENCL_OP_EACCEPT; + + EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + EXPECT_EQ(eaccept_op.ret, 0); + + /* + * New page should be accessible from within enclave - write to it. + */ + put_addr_op.value = MAGIC; + put_addr_op.addr = (unsigned long)tcs; + put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; + + EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + /* + * Read memory from newly added page that was just written to, + * confirming that data previously written (MAGIC) is present. 
+ */ + get_addr_op.value = 0; + get_addr_op.addr = (unsigned long)tcs; + get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; + + EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); + + EXPECT_EQ(get_addr_op.value, MAGIC); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + munmap(addr, 3 * PAGE_SIZE); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/sgx/main.h b/tools/testing/selftests/sgx/main.h index b45c52ec7ab3..fc585be97e2f 100644 --- a/tools/testing/selftests/sgx/main.h +++ b/tools/testing/selftests/sgx/main.h @@ -38,6 +38,7 @@ void encl_delete(struct encl *ctx); bool encl_load(const char *path, struct encl *encl, unsigned long heap_size); bool encl_measure(struct encl *encl); bool encl_build(struct encl *encl); +uint64_t encl_get_entry(struct encl *encl, const char *symbol); int sgx_enter_enclave(void *rdi, void *rsi, long rdx, u32 function, void *r8, void *r9, struct sgx_enclave_run *run); -- Gitee From 78631cd1b124626db043f7e88b33889c3d286f2e Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:09:04 -0700 Subject: [PATCH 121/134] selftests/sgx: Test faulty enclave behavior commit 50b822e4b785948ed663c89c84e124fc8c099c9b upstream. Removing a page from an initialized enclave involves three steps: first the user requests changing the page type to SGX_PAGE_TYPE_TRIM via an ioctl(), on success the ENCLU[EACCEPT] instruction needs to be run from within the enclave to accept the page removal, finally the user requests page removal to be completed via an ioctl(). Only after acceptance (ENCLU[EACCEPT]) from within the enclave can the kernel remove the page from a running enclave. Test the behavior when the user's request to change the page type succeeds, but the ENCLU[EACCEPT] instruction is not run before the ioctl() requesting page removal is run. This should not be permitted. Intel-SIG: commit 50b822e4b785 selftests/sgx: Test faulty enclave behavior. Backport for SGX EDMM support. Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/fa5da30ebac108b7517194c3038b52995602b996.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/main.c | 114 +++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index d283faed018f..00b2718ddf21 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -1432,4 +1432,118 @@ TEST_F(enclave, tcs_create) munmap(addr, 3 * PAGE_SIZE); } +/* + * Ensure sane behavior if user requests page removal, does not run + * EACCEPT from within enclave but still attempts to finalize page removal + * with the SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl(). The latter should fail + * because the removal was not EACCEPTed from within the enclave. + */ +TEST_F(enclave, remove_added_page_no_eaccept) +{ + struct sgx_enclave_remove_pages remove_ioc; + struct encl_op_get_from_addr get_addr_op; + struct sgx_enclave_modify_types modt_ioc; + struct encl_op_put_to_addr put_addr_op; + unsigned long data_start; + int ret, errno_save; + + ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); + + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; + + /* + * Hardware (SGX2) and kernel support is needed for this test. 
Start + * with check that test has a chance of succeeding. + */ + memset(&modt_ioc, 0, sizeof(modt_ioc)); + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); + + if (ret == -1) { + if (errno == ENOTTY) + SKIP(return, + "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()"); + else if (errno == ENODEV) + SKIP(return, "System does not support SGX2"); + } + + /* + * Invalid parameters were provided during sanity check, + * expect command to fail. + */ + EXPECT_EQ(ret, -1); + + /* + * Page that will be removed is the second data page in the .data + * segment. This forms part of the local encl_buffer within the + * enclave. + */ + data_start = self->encl.encl_base + + encl_get_data_offset(&self->encl) + PAGE_SIZE; + + /* + * Sanity check that page at @data_start is writable before + * removing it. + * + * Start by writing MAGIC to test page. + */ + put_addr_op.value = MAGIC; + put_addr_op.addr = data_start; + put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; + + EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + /* + * Read memory that was just written to, confirming that data + * previously written (MAGIC) is present. + */ + get_addr_op.value = 0; + get_addr_op.addr = data_start; + get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; + + EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); + + EXPECT_EQ(get_addr_op.value, MAGIC); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + /* Start page removal by requesting change of page type to PT_TRIM */ + memset(&modt_ioc, 0, sizeof(modt_ioc)); + + modt_ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; + modt_ioc.length = PAGE_SIZE; + modt_ioc.page_type = SGX_PAGE_TYPE_TRIM; + + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); + errno_save = ret == -1 ? errno : 0; + + EXPECT_EQ(ret, 0); + EXPECT_EQ(errno_save, 0); + EXPECT_EQ(modt_ioc.result, 0); + EXPECT_EQ(modt_ioc.count, 4096); + + /* Skip EACCEPT */ + + /* Send final ioctl() to complete page removal */ + memset(&remove_ioc, 0, sizeof(remove_ioc)); + + remove_ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; + remove_ioc.length = PAGE_SIZE; + + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_REMOVE_PAGES, &remove_ioc); + errno_save = ret == -1 ? errno : 0; + + /* Operation not permitted since EACCEPT was omitted. */ + EXPECT_EQ(ret, -1); + EXPECT_EQ(errno_save, EPERM); + EXPECT_EQ(remove_ioc.count, 0); +} + TEST_HARNESS_MAIN -- Gitee From 64e9ac4fb46aa2d267051134177e60af9c5aa8a5 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:09:05 -0700 Subject: [PATCH 122/134] selftests/sgx: Test invalid access to removed enclave page commit 35c7e6dacb038e9311e98901d56bb1abd56f9ae0 upstream. Removing a page from an initialized enclave involves three steps: (1) the user requests changing the page type to SGX_PAGE_TYPE_TRIM via the SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl(), (2) on success the ENCLU[EACCEPT] instruction is run from within the enclave to accept the page removal, (3) the user initiates the actual removal of the page via the SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl(). 
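[ Editor's note: the three steps enumerated above, sketched for a single page with the structs used by the tests below (page_offset, page_addr, encl_fd and run illustrative); the tests that follow deliberately deviate from this valid sequence:

	/* (1) Request the type change. */
	modt_ioc.offset = page_offset;
	modt_ioc.length = PAGE_SIZE;
	modt_ioc.page_type = SGX_PAGE_TYPE_TRIM;
	ioctl(encl_fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc);

	/* (2) Accept the removal from within the enclave. */
	eaccept_op.epc_addr = page_addr;
	eaccept_op.flags = SGX_SECINFO_TRIM | SGX_SECINFO_MODIFIED;
	eaccept_op.header.type = ENCL_OP_EACCEPT;
	ENCL_CALL(&eaccept_op, &run, true);

	/* (3) Complete the removal. */
	remove_ioc.offset = page_offset;
	remove_ioc.length = PAGE_SIZE;
	ioctl(encl_fd, SGX_IOC_ENCLAVE_REMOVE_PAGES, &remove_ioc);
]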
Test two possible invalid accesses during the page removal flow: * Test the behavior when a request to remove the page by changing its type to SGX_PAGE_TYPE_TRIM completes successfully but instead of executing ENCLU[EACCEPT] from within the enclave the enclave attempts to read from the page. Even though the page is accessible from the page table entries its type is SGX_PAGE_TYPE_TRIM and thus not accessible according to SGX. The expected behavior is a page fault with the SGX flag set in the error code. * Test the behavior when the page type is changed successfully and ENCLU[EACCEPT] was run from within the enclave. The final ioctl(), SGX_IOC_ENCLAVE_REMOVE_PAGES, is omitted and replaced with an attempt to access the page. Even though the page is accessible from the page table entries its type is SGX_PAGE_TYPE_TRIM and thus not accessible according to SGX. The expected behavior is a page fault with the SGX flag set in the error code. Intel-SIG: commit 35c7e6dacb03 selftests/sgx: Test invalid access to removed enclave page. Backport for SGX EDMM support. Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/189a86c25d6d62da7cfdd08ee97abc1a06fcc179.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/main.c | 243 +++++++++++++++++++++++++++++ 1 file changed, 243 insertions(+) diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 00b2718ddf21..3e0d8902ce55 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -1546,4 +1546,247 @@ TEST_F(enclave, remove_added_page_no_eaccept) EXPECT_EQ(remove_ioc.count, 0); } +/* + * Request enclave page removal but instead of correctly following with + * EACCEPT a read attempt to page is made from within the enclave. + */ +TEST_F(enclave, remove_added_page_invalid_access) +{ + struct encl_op_get_from_addr get_addr_op; + struct encl_op_put_to_addr put_addr_op; + struct sgx_enclave_modify_types ioc; + unsigned long data_start; + int ret, errno_save; + + ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); + + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; + + /* + * Hardware (SGX2) and kernel support is needed for this test. Start + * with check that test has a chance of succeeding. + */ + memset(&ioc, 0, sizeof(ioc)); + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &ioc); + + if (ret == -1) { + if (errno == ENOTTY) + SKIP(return, + "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()"); + else if (errno == ENODEV) + SKIP(return, "System does not support SGX2"); + } + + /* + * Invalid parameters were provided during sanity check, + * expect command to fail. + */ + EXPECT_EQ(ret, -1); + + /* + * Page that will be removed is the second data page in the .data + * segment. This forms part of the local encl_buffer within the + * enclave. + */ + data_start = self->encl.encl_base + + encl_get_data_offset(&self->encl) + PAGE_SIZE; + + /* + * Sanity check that page at @data_start is writable before + * removing it. + * + * Start by writing MAGIC to test page. 
+ */ + put_addr_op.value = MAGIC; + put_addr_op.addr = data_start; + put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; + + EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + /* + * Read memory that was just written to, confirming that data + * previously written (MAGIC) is present. + */ + get_addr_op.value = 0; + get_addr_op.addr = data_start; + get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; + + EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); + + EXPECT_EQ(get_addr_op.value, MAGIC); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + /* Start page removal by requesting change of page type to PT_TRIM. */ + memset(&ioc, 0, sizeof(ioc)); + + ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; + ioc.length = PAGE_SIZE; + ioc.page_type = SGX_PAGE_TYPE_TRIM; + + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &ioc); + errno_save = ret == -1 ? errno : 0; + + EXPECT_EQ(ret, 0); + EXPECT_EQ(errno_save, 0); + EXPECT_EQ(ioc.result, 0); + EXPECT_EQ(ioc.count, 4096); + + /* + * Read from page that was just removed. + */ + get_addr_op.value = 0; + + EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); + + /* + * From kernel perspective the page is present but according to SGX the + * page should not be accessible so a #PF with SGX bit set is + * expected. + */ + + EXPECT_EQ(self->run.function, ERESUME); + EXPECT_EQ(self->run.exception_vector, 14); + EXPECT_EQ(self->run.exception_error_code, 0x8005); + EXPECT_EQ(self->run.exception_addr, data_start); +} + +/* + * Request enclave page removal and correctly follow with + * EACCEPT but do not follow with removal ioctl() but instead a read attempt + * to removed page is made from within the enclave. + */ +TEST_F(enclave, remove_added_page_invalid_access_after_eaccept) +{ + struct encl_op_get_from_addr get_addr_op; + struct encl_op_put_to_addr put_addr_op; + struct sgx_enclave_modify_types ioc; + struct encl_op_eaccept eaccept_op; + unsigned long data_start; + int ret, errno_save; + + ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); + + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; + + /* + * Hardware (SGX2) and kernel support is needed for this test. Start + * with check that test has a chance of succeeding. + */ + memset(&ioc, 0, sizeof(ioc)); + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &ioc); + + if (ret == -1) { + if (errno == ENOTTY) + SKIP(return, + "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()"); + else if (errno == ENODEV) + SKIP(return, "System does not support SGX2"); + } + + /* + * Invalid parameters were provided during sanity check, + * expect command to fail. + */ + EXPECT_EQ(ret, -1); + + /* + * Page that will be removed is the second data page in the .data + * segment. This forms part of the local encl_buffer within the + * enclave. + */ + data_start = self->encl.encl_base + + encl_get_data_offset(&self->encl) + PAGE_SIZE; + + /* + * Sanity check that page at @data_start is writable before + * removing it. + * + * Start by writing MAGIC to test page. 
+ */ + put_addr_op.value = MAGIC; + put_addr_op.addr = data_start; + put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; + + EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + /* + * Read memory that was just written to, confirming that data + * previously written (MAGIC) is present. + */ + get_addr_op.value = 0; + get_addr_op.addr = data_start; + get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; + + EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); + + EXPECT_EQ(get_addr_op.value, MAGIC); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + /* Start page removal by requesting change of page type to PT_TRIM. */ + memset(&ioc, 0, sizeof(ioc)); + + ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; + ioc.length = PAGE_SIZE; + ioc.page_type = SGX_PAGE_TYPE_TRIM; + + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &ioc); + errno_save = ret == -1 ? errno : 0; + + EXPECT_EQ(ret, 0); + EXPECT_EQ(errno_save, 0); + EXPECT_EQ(ioc.result, 0); + EXPECT_EQ(ioc.count, 4096); + + eaccept_op.epc_addr = (unsigned long)data_start; + eaccept_op.ret = 0; + eaccept_op.flags = SGX_SECINFO_TRIM | SGX_SECINFO_MODIFIED; + eaccept_op.header.type = ENCL_OP_EACCEPT; + + EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + EXPECT_EQ(eaccept_op.ret, 0); + + /* Skip ioctl() to remove page. */ + + /* + * Read from page that was just removed. + */ + get_addr_op.value = 0; + + EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); + + /* + * From kernel perspective the page is present but according to SGX the + * page should not be accessible so a #PF with SGX bit set is + * expected. + */ + + EXPECT_EQ(self->run.function, ERESUME); + EXPECT_EQ(self->run.exception_vector, 14); + EXPECT_EQ(self->run.exception_error_code, 0x8005); + EXPECT_EQ(self->run.exception_addr, data_start); +} + TEST_HARNESS_MAIN -- Gitee From 327f8edf0fe2fbf056e5ae71009ed60a059d3251 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:09:06 -0700 Subject: [PATCH 123/134] selftests/sgx: Test reclaiming of untouched page commit 08ceab2c37d32f422f8d98540656ee5a416ba729 upstream. Removing a page from an initialized enclave involves three steps: (1) the user requests changing the page type to PT_TRIM via the SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl() (2) on success the ENCLU[EACCEPT] instruction is run from within the enclave to accept the page removal (3) the user initiates the actual removal of the page via the SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl(). Remove a page that has never been accessed. This means that when the first ioctl() requesting page removal arrives, there will be no page table entry, yet a valid page table entry needs to exist for the ENCLU[EACCEPT] function to succeed. In this test it is verified that a page table entry can still be installed for a page that is in the process of being removed. Intel-SIG: commit 08ceab2c37d3 selftests/sgx: Test reclaiming of untouched page. Backport for SGX EDMM support. 
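[ Editor's note: what makes this case special, sketched with the helpers used below (data_start and run illustrative). The EACCEPT is the page's first-ever access, so the fault handler must still install a PTE for a page whose removal is already in progress:

	/* Page was never read or written, so no PTE exists yet. */
	eaccept_op.epc_addr = data_start;
	eaccept_op.flags = SGX_SECINFO_TRIM | SGX_SECINFO_MODIFIED;
	eaccept_op.header.type = ENCL_OP_EACCEPT;
	ENCL_CALL(&eaccept_op, &run, true);	/* expected to succeed */
]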
Suggested-by: Haitao Huang Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/45e1b2a2fcd8c14597d04e40af5d8a9c1c5b017e.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/main.c | 80 ++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 3e0d8902ce55..7cfe80d6dee9 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -1789,4 +1789,84 @@ TEST_F(enclave, remove_added_page_invalid_access_after_eaccept) EXPECT_EQ(self->run.exception_addr, data_start); } +TEST_F(enclave, remove_untouched_page) +{ + struct sgx_enclave_remove_pages remove_ioc; + struct sgx_enclave_modify_types modt_ioc; + struct encl_op_eaccept eaccept_op; + unsigned long data_start; + int ret, errno_save; + + ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); + + /* + * Hardware (SGX2) and kernel support is needed for this test. Start + * with check that test has a chance of succeeding. + */ + memset(&modt_ioc, 0, sizeof(modt_ioc)); + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); + + if (ret == -1) { + if (errno == ENOTTY) + SKIP(return, + "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()"); + else if (errno == ENODEV) + SKIP(return, "System does not support SGX2"); + } + + /* + * Invalid parameters were provided during sanity check, + * expect command to fail. + */ + EXPECT_EQ(ret, -1); + + /* SGX2 is supported by kernel and hardware, test can proceed. */ + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; + + data_start = self->encl.encl_base + + encl_get_data_offset(&self->encl) + PAGE_SIZE; + + memset(&modt_ioc, 0, sizeof(modt_ioc)); + + modt_ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; + modt_ioc.length = PAGE_SIZE; + modt_ioc.page_type = SGX_PAGE_TYPE_TRIM; + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); + errno_save = ret == -1 ? errno : 0; + + EXPECT_EQ(ret, 0); + EXPECT_EQ(errno_save, 0); + EXPECT_EQ(modt_ioc.result, 0); + EXPECT_EQ(modt_ioc.count, 4096); + + /* + * Enter enclave via TCS #1 and approve page removal by sending + * EACCEPT for removed page. + */ + + eaccept_op.epc_addr = data_start; + eaccept_op.flags = SGX_SECINFO_TRIM | SGX_SECINFO_MODIFIED; + eaccept_op.ret = 0; + eaccept_op.header.type = ENCL_OP_EACCEPT; + + EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + EXPECT_EQ(eaccept_op.ret, 0); + + memset(&remove_ioc, 0, sizeof(remove_ioc)); + + remove_ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; + remove_ioc.length = PAGE_SIZE; + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_REMOVE_PAGES, &remove_ioc); + errno_save = ret == -1 ? errno : 0; + + EXPECT_EQ(ret, 0); + EXPECT_EQ(errno_save, 0); + EXPECT_EQ(remove_ioc.count, 4096); +} + TEST_HARNESS_MAIN -- Gitee From 36aadcf334aa87cc79a5e29a3d9c4ce937e68c50 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:09:07 -0700 Subject: [PATCH 124/134] selftests/sgx: Page removal stress test commit 6507cce561b43b071999502103804e3dc1478e60 upstream. Create enclave with additional heap that consumes all physical SGX memory and then remove it. 
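[ Editor's note: as the next paragraph details, the run time is dominated by one enclave entry per page. A sketch of that loop, with heap_base, heap_size and run illustrative:

	for (offset = 0; offset < heap_size; offset += PAGE_SIZE) {
		eaccept_op.epc_addr = heap_base + offset;
		eaccept_op.flags = SGX_SECINFO_TRIM | SGX_SECINFO_MODIFIED;
		eaccept_op.header.type = ENCL_OP_EACCEPT;
		eaccept_op.ret = 0;
		ENCL_CALL(&eaccept_op, &run, true);
	}
]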
Depending on the available SGX memory this test could take a significant time to run (several minutes) as it (1) creates the enclave, (2) changes the type of every page to be trimmed, (3) enters the enclave once per page to run EACCEPT, before (4) the pages are finally removed. Intel-SIG: commit 6507cce561b4 selftests/sgx: Page removal stress test. Backport for SGX EDMM support. Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/e7c6aa2ab30cb1c41e52b776958409c06970d168.1652137848.git.reinette.chatre@intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/main.c | 120 +++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 7cfe80d6dee9..e26fd951164b 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -377,7 +377,127 @@ TEST_F(enclave, unclobbered_vdso_oversubscribed) EXPECT_EQ(get_op.value, MAGIC); EXPECT_EEXIT(&self->run); EXPECT_EQ(self->run.user_data, 0); +} + +TEST_F_TIMEOUT(enclave, unclobbered_vdso_oversubscribed_remove, 900) +{ + struct sgx_enclave_remove_pages remove_ioc; + struct sgx_enclave_modify_types modt_ioc; + struct encl_op_get_from_buf get_op; + struct encl_op_eaccept eaccept_op; + struct encl_op_put_to_buf put_op; + struct encl_segment *heap; + unsigned long total_mem; + int ret, errno_save; + unsigned long addr; + unsigned long i; + + /* + * Create enclave with additional heap that is as big as all + * available physical SGX memory. + */ + total_mem = get_total_epc_mem(); + ASSERT_NE(total_mem, 0); + TH_LOG("Creating an enclave with %lu bytes heap may take a while ...", + total_mem); + ASSERT_TRUE(setup_test_encl(total_mem, &self->encl, _metadata)); + + /* + * Hardware (SGX2) and kernel support is needed for this test. Start + * with check that test has a chance of succeeding. + */ + memset(&modt_ioc, 0, sizeof(modt_ioc)); + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); + + if (ret == -1) { + if (errno == ENOTTY) + SKIP(return, + "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()"); + else if (errno == ENODEV) + SKIP(return, "System does not support SGX2"); + } + + /* + * Invalid parameters were provided during sanity check, + * expect command to fail. + */ + EXPECT_EQ(ret, -1); + + /* SGX2 is supported by kernel and hardware, test can proceed. */ + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; + + heap = &self->encl.segment_tbl[self->encl.nr_segments - 1]; + + put_op.header.type = ENCL_OP_PUT_TO_BUFFER; + put_op.value = MAGIC; + + EXPECT_EQ(ENCL_CALL(&put_op, &self->run, false), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.user_data, 0); + + get_op.header.type = ENCL_OP_GET_FROM_BUFFER; + get_op.value = 0; + + EXPECT_EQ(ENCL_CALL(&get_op, &self->run, false), 0); + + EXPECT_EQ(get_op.value, MAGIC); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.user_data, 0); + /* Trim entire heap. */ + memset(&modt_ioc, 0, sizeof(modt_ioc)); + + modt_ioc.offset = heap->offset; + modt_ioc.length = heap->size; + modt_ioc.page_type = SGX_PAGE_TYPE_TRIM; + + TH_LOG("Changing type of %zd bytes to trimmed may take a while ...", + heap->size); + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); + errno_save = ret == -1 ? 
errno : 0; + + EXPECT_EQ(ret, 0); + EXPECT_EQ(errno_save, 0); + EXPECT_EQ(modt_ioc.result, 0); + EXPECT_EQ(modt_ioc.count, heap->size); + + /* EACCEPT all removed pages. */ + addr = self->encl.encl_base + heap->offset; + + eaccept_op.flags = SGX_SECINFO_TRIM | SGX_SECINFO_MODIFIED; + eaccept_op.header.type = ENCL_OP_EACCEPT; + + TH_LOG("Entering enclave to run EACCEPT for each page of %zd bytes may take a while ...", + heap->size); + for (i = 0; i < heap->size; i += 4096) { + eaccept_op.epc_addr = addr + i; + eaccept_op.ret = 0; + + EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); + + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + ASSERT_EQ(eaccept_op.ret, 0); + ASSERT_EQ(self->run.function, EEXIT); + } + + /* Complete page removal. */ + memset(&remove_ioc, 0, sizeof(remove_ioc)); + + remove_ioc.offset = heap->offset; + remove_ioc.length = heap->size; + + TH_LOG("Removing %zd bytes from enclave may take a while ...", + heap->size); + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_REMOVE_PAGES, &remove_ioc); + errno_save = ret == -1 ? errno : 0; + + EXPECT_EQ(ret, 0); + EXPECT_EQ(errno_save, 0); + EXPECT_EQ(remove_ioc.count, heap->size); } TEST_F(enclave, clobbered_vdso) -- Gitee From 55f492392f8f7f8c500ea02ddd8468338a493329 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Jul 2022 09:21:24 -0700 Subject: [PATCH 125/134] x86/sgx: Drop 'page_index' from sgx_backing commit e0a5915f1cca21da8ffc0563aea9fa1df5d16fb4 upstream. Storing the 'page_index' value in the sgx_backing struct is dead code and no longer needed. Intel-SIG: commit e0a5915f1cca x86/sgx: Drop 'page_index' from sgx_backing. Backport for SGX EDMM support. Signed-off-by: Sean Christopherson Signed-off-by: Kristen Carlson Accardi Signed-off-by: Dave Hansen Link: https://lkml.kernel.org/r/20220708162124.8442-1-kristen@linux.intel.com [ Zhiquan: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.c | 1 - arch/x86/kernel/cpu/sgx/encl.h | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index b13d1c5f2bc7..49fd6fcca26d 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -943,7 +943,6 @@ static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, return PTR_ERR(pcmd); } - backing->page_index = page_index; backing->contents = contents; backing->pcmd = pcmd; backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1); diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index 8f809959baa9..a65a952116fd 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -79,7 +79,6 @@ struct sgx_va_page { }; struct sgx_backing { - pgoff_t page_index; struct page *contents; struct page *pcmd; unsigned long pcmd_offset; -- Gitee From 48a623d748074dbb318ab989bee8212387fdd3d8 Mon Sep 17 00:00:00 2001 From: Kristen Carlson Accardi Date: Fri, 12 Aug 2022 11:07:13 -0700 Subject: [PATCH 126/134] selftests/sgx: Ignore OpenSSL 3.0 deprecated functions warning commit 5f4d1fd5b5d3506759b5d9cf20bb5fb5b8bdcab1 upstream. OpenSSL 3.0 deprecates some of the functions used in the SGX selftests, causing build errors on new distros. For now ignore the warnings until support for the functions is no longer available and mark FIXME so that it can be clear this should be removed at some point. 
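A narrower alternative, sketched here only for comparison and not what the patch does, would be to push and pop the diagnostic state around the individual legacy calls (alloc_sign_key() is a hypothetical name):

  #include <openssl/rsa.h>

  static RSA *alloc_sign_key(void)
  {
          RSA *key;

  #pragma GCC diagnostic push
  #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
          key = RSA_new();        /* deprecated since OpenSSL 3.0 */
  #pragma GCC diagnostic pop

          return key;
  }

The file-wide pragma used by the patch is the simpler choice here since essentially all of sigstruct.c's OpenSSL usage is affected.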
Intel-SIG: commit 5f4d1fd5b5d3 selftests/sgx: Ignore OpenSSL 3.0 deprecated functions warning Incremental backporting patches for SGX on Intel Xeon platform. Signed-off-by: Kristen Carlson Accardi Reviewed-by: Jarkko Sakkinen Signed-off-by: Shuah Khan [ Zhiquan Li: amend commit log. ] Signed-off-by: Zhiquan Li --- tools/testing/selftests/sgx/sigstruct.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/sgx/sigstruct.c b/tools/testing/selftests/sgx/sigstruct.c index 50c5ab1aa6fa..a07896a46364 100644 --- a/tools/testing/selftests/sgx/sigstruct.c +++ b/tools/testing/selftests/sgx/sigstruct.c @@ -17,6 +17,12 @@ #include "defines.h" #include "main.h" +/* + * FIXME: OpenSSL 3.0 has deprecated some functions. For now just ignore + * the warnings. + */ +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + struct q1q2_ctx { BN_CTX *bn_ctx; BIGNUM *m; -- Gitee From 668eb4d779dc032b09a5a19d5ae91d9595268233 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 6 Sep 2022 03:02:20 +0300 Subject: [PATCH 127/134] x86/sgx: Do not fail on incomplete sanitization on premature stop of ksgxd commit 133e049a3f8c91b175029fb6a59b6039d5e79cba upstream. Unsanitized pages trigger WARN_ON() unconditionally, which can panic the whole computer if /proc/sys/kernel/panic_on_warn is set. In sgx_init(), if misc_register() fails, or misc_register() succeeds but neither sgx_drv_init() nor sgx_vepc_init() succeeds, then ksgxd will be prematurely stopped. This may leave unsanitized pages, which will result in a false warning. Refine __sgx_sanitize_pages() to return: 1. Zero when the sanitization process is complete or ksgxd has been requested to stop. 2. The number of unsanitized pages otherwise. Intel-SIG: commit 133e049a3f8c x86/sgx: Do not fail on incomplete sanitization on premature stop of ksgxd Incremental backporting patches for SGX on Intel Xeon platform. Fixes: 51ab30eb2ad4 ("x86/sgx: Replace section->init_laundry_list with sgx_dirty_page_list") Reported-by: Paul Menzel Signed-off-by: Jarkko Sakkinen Signed-off-by: Dave Hansen Reviewed-by: Reinette Chatre Cc: stable@vger.kernel.org Link: https://lore.kernel.org/linux-sgx/20220825051827.246698-1-jarkko@kernel.org/T/#u Link: https://lkml.kernel.org/r/20220906000221.34286-2-jarkko@kernel.org [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/main.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index ad1ee2df9508..6f1d2d11167f 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -49,9 +49,13 @@ static LIST_HEAD(sgx_dirty_page_list); * Reset post-kexec EPC pages to the uninitialized state. The pages are removed * from the input list, and made available for the page allocator. SECS pages * prepending their children in the input list are left intact. + * + * Return 0 when sanitization was successful or kthread was stopped, and the + * number of unsanitized pages otherwise.
*/ -static void __sgx_sanitize_pages(struct list_head *dirty_page_list) +static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list) { + unsigned long left_dirty = 0; struct sgx_epc_page *page; LIST_HEAD(dirty); int ret; @@ -59,7 +63,7 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list) /* dirty_page_list is thread-local, no need for a lock: */ while (!list_empty(dirty_page_list)) { if (kthread_should_stop()) - return; + return 0; page = list_first_entry(dirty_page_list, struct sgx_epc_page, list); @@ -92,12 +96,14 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list) } else { /* The page is not yet clean - move to the dirty list. */ list_move_tail(&page->list, &dirty); + left_dirty++; } cond_resched(); } list_splice(&dirty, dirty_page_list); + return left_dirty; } static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page) @@ -395,10 +401,7 @@ static int ksgxd(void *p) * required for SECS pages, whose child pages blocked EREMOVE. */ __sgx_sanitize_pages(&sgx_dirty_page_list); - __sgx_sanitize_pages(&sgx_dirty_page_list); - - /* sanity check: */ - WARN_ON(!list_empty(&sgx_dirty_page_list)); + WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list)); while (!kthread_should_stop()) { if (try_to_freeze()) -- Gitee From 041fced35a03fd3733eccae7657b6150751c976d Mon Sep 17 00:00:00 2001 From: Haitao Huang Date: Tue, 6 Sep 2022 03:02:21 +0300 Subject: [PATCH 128/134] x86/sgx: Handle VA page allocation failure for EAUG on PF. commit 81fa6fd13b5c43601fba8486f2385dbd7c1935e2 upstream. VM_FAULT_NOPAGE is expected behaviour for -EBUSY failure path, when augmenting a page, as this means that the reclaimer thread has been triggered, and the intention is just to round-trip in ring-3, and retry with a new page fault. Intel-SIG: commit 81fa6fd13b5c x86/sgx: Handle VA page allocation failure for EAUG on PF. Incremental backporting patches for SGX on Intel Xeon platform. Fixes: 5a90d2c3f5ef ("x86/sgx: Support adding of pages to an initialized enclave") Signed-off-by: Haitao Huang Signed-off-by: Jarkko Sakkinen Signed-off-by: Dave Hansen Reviewed-by: Reinette Chatre Tested-by: Vijay Dhanraj Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20220906000221.34286-3-jarkko@kernel.org [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 49fd6fcca26d..fed82cbe1887 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -344,8 +344,11 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, } va_page = sgx_encl_grow(encl, false); - if (IS_ERR(va_page)) + if (IS_ERR(va_page)) { + if (PTR_ERR(va_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; goto err_out_epc; + } if (va_page) list_add(&va_page->list, &encl->va_pages); -- Gitee From 5c4b96da0ee543f25363b5d3aec75fdc30445abb Mon Sep 17 00:00:00 2001 From: Kristen Carlson Accardi Date: Fri, 12 Aug 2022 11:18:03 -0700 Subject: [PATCH 129/134] x86/sgx: Improve comments for sgx_encl_lookup/alloc_backing() commit ee56a283988d739c25d2d00ffb22707cb487ab47 upstream. Modify the comments for sgx_encl_lookup_backing() and for sgx_encl_alloc_backing() to indicate that they take a reference which must be dropped with a call to sgx_encl_put_backing(). 
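A minimal sketch of the pairing that the updated comments document (hypothetical caller, error handling trimmed):

  /* Usage sketch: a successful sgx_encl_alloc_backing() (or internal
   * lookup) pins the backing page and must be paired with a call to
   * sgx_encl_put_backing() once the caller is done with it.
   */
  static int touch_backing_page(struct sgx_encl *encl,
                                unsigned long page_index)
  {
          struct sgx_backing backing;
          int ret;

          ret = sgx_encl_alloc_backing(encl, page_index, &backing);
          if (ret)
                  return ret;

          /* backing.contents and backing.pcmd are safe to use here */

          sgx_encl_put_backing(&backing);
          return 0;
  }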
Make sgx_encl_lookup_backing() static for now, and change the name of sgx_encl_get_backing() to __sgx_encl_get_backing() to make it more clear that sgx_encl_get_backing() is an internal function. Intel-SIG: commit ee56a283988d x86/sgx: Improve comments for sgx_encl_lookup/alloc_backing() Incremental backporting patches for SGX on Intel Xeon platform. Signed-off-by: Kristen Carlson Accardi Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/all/YtUs3MKLzFg+rqEV@zn.tnic/ [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.c | 21 ++++++++++++++------- arch/x86/kernel/cpu/sgx/encl.h | 2 -- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index fed82cbe1887..5a8c2913d4fa 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -12,6 +12,9 @@ #include "encls.h" #include "sgx.h" +static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing); + #define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd)) /* * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to @@ -917,7 +920,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, } /** - * sgx_encl_get_backing() - Pin the backing storage + * __sgx_encl_get_backing() - Pin the backing storage * @encl: an enclave pointer * @page_index: enclave page index * @backing: data for accessing backing storage for the page @@ -929,7 +932,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, * 0 on success, * -errno otherwise. */ -static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, +static int __sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing) { pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); @@ -1004,7 +1007,7 @@ static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl) } /** - * sgx_encl_alloc_backing() - allocate a new backing storage page + * sgx_encl_alloc_backing() - create a new backing storage page * @encl: an enclave pointer * @page_index: enclave page index * @backing: data for accessing backing storage for the page @@ -1012,7 +1015,9 @@ static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl) * When called from ksgxd, sets the active memcg from one of the * mms in the enclave's mm_list prior to any backing page allocation, * in order to ensure that shmem page allocations are charged to the - * enclave. + * enclave. Create a backing page for loading data back into an EPC page with + * ELDU. This function takes a reference on a new backing page which + * must be dropped with a corresponding call to sgx_encl_put_backing(). * * Return: * 0 on success, @@ -1025,7 +1030,7 @@ int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, struct mem_cgroup *memcg = set_active_memcg(encl_memcg); int ret; - ret = sgx_encl_get_backing(encl, page_index, backing); + ret = __sgx_encl_get_backing(encl, page_index, backing); set_active_memcg(memcg); mem_cgroup_put(encl_memcg); @@ -1043,15 +1048,17 @@ int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, * It is the caller's responsibility to ensure that it is appropriate to use * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is * not used correctly, this will cause an allocation which is not accounted for. 
+ * This function takes a reference on an existing backing page which must be + * dropped with a corresponding call to sgx_encl_put_backing(). * * Return: * 0 on success, * -errno otherwise. */ -int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, +static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing) { - return sgx_encl_get_backing(encl, page_index, backing); + return __sgx_encl_get_backing(encl, page_index, backing); } /** diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index a65a952116fd..f94ff14c9486 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -107,8 +107,6 @@ bool current_is_ksgxd(void); void sgx_encl_release(struct kref *ref); int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl); -int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, - struct sgx_backing *backing); int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing); void sgx_encl_put_backing(struct sgx_backing *backing); -- Gitee From 6f44bbe315e440425202f3280cf54aad565da6f0 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Mon, 31 Oct 2022 10:29:58 -0700 Subject: [PATCH 130/134] x86/sgx: Reduce delay and interference of enclave release commit 7b72c823ddf8aaaec4e9fb28e6fbe4d511e7dad1 upstream. commit 8795359e35bc ("x86/sgx: Silence softlockup detection when releasing large enclaves") introduced a cond_resched() during enclave release where the EREMOVE instruction is applied to every 4k enclave page. Giving other tasks an opportunity to run while tearing down a large enclave placates the soft lockup detector but Iqbal found that the fix causes a 25% performance degradation of a workload run using Gramine. Gramine maintains a 1:1 mapping between processes and SGX enclaves. That means if a workload in an enclave creates a subprocess then Gramine creates a duplicate enclave for that subprocess to run in. The consequence is that the release of the enclave used to run the subprocess can impact the performance of the workload that is run in the original enclave, especially in large enclaves when SGX2 is not in use. The workload run by Iqbal behaves as follows: Create enclave (enclave "A") /* Initialize workload in enclave "A" */ Create enclave (enclave "B") /* Run subprocess in enclave "B" and send result to enclave "A" */ Release enclave (enclave "B") /* Run workload in enclave "A" */ Release enclave (enclave "A") The performance impact of releasing enclave "B" in the above scenario is amplified when there is a lot of SGX memory and the enclave size matches the SGX memory. When there is 128GB SGX memory and an enclave size of 128GB, from the time enclave "B" starts the 128GB SGX memory is oversubscribed with a combined demand for 256GB from the two enclaves. Before commit 8795359e35bc ("x86/sgx: Silence softlockup detection when releasing large enclaves") enclave release was done in a tight loop without giving other tasks a chance to run. Even though the system experienced soft lockups the workload (run in enclave "A") obtained good performance numbers because when the workload started running there was no interference. Commit 8795359e35bc ("x86/sgx: Silence softlockup detection when releasing large enclaves") gave other tasks opportunity to run while an enclave is released. 
The impact of this in this scenario is that while enclave "B" is being released and needs to access each page that belongs to it in order to run the SGX EREMOVE instruction on it, enclave "A" is attempting to run its workload and needs to access its own enclave pages. This causes a lot of swapping due to the demand for the oversubscribed SGX memory. Longer latencies are experienced by the workload in enclave "A" while enclave "B" is released. Improve the performance of enclave release while still avoiding the soft lockup detector with two enhancements: - Only call cond_resched() after XA_CHECK_SCHED iterations. - Use the xarray advanced API to keep the xarray locked for XA_CHECK_SCHED iterations instead of locking and unlocking at every iteration. This batching solution is copied from sgx_encl_may_map(), which also iterates through all enclave pages using this technique. With this enhancement the workload experiences a 5% performance degradation when compared to a kernel without commit 8795359e35bc ("x86/sgx: Silence softlockup detection when releasing large enclaves"), an improvement over the reported 25% degradation, while still placating the soft lockup detector. Scenarios with poor performance are still possible even with these enhancements, for example short workloads creating subprocesses while running in large enclaves. Further performance improvements are pursued in user space by avoiding the creation of duplicate enclaves for certain subprocesses, and by using SGX2, which lazily allocates pages as needed so that enclaves created for subprocesses start quickly and release quickly. Intel-SIG: commit 7b72c823ddf8 x86/sgx: Reduce delay and interference of enclave release Incremental backporting patches for SGX on Intel Xeon platform. Fixes: 8795359e35bc ("x86/sgx: Silence softlockup detection when releasing large enclaves") Reported-by: Md Iqbal Hossain Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Tested-by: Md Iqbal Hossain Link: https://lore.kernel.org/all/00efa80dd9e35dc85753e1c5edb0344ac07bb1f0.1667236485.git.reinette.chatre%40intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/encl.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 5a8c2913d4fa..edc9e1e44c17 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -680,11 +680,15 @@ const struct vm_operations_struct sgx_vm_ops = { void sgx_encl_release(struct kref *ref) { struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount); + unsigned long max_page_index = PFN_DOWN(encl->base + encl->size - 1); struct sgx_va_page *va_page; struct sgx_encl_page *entry; - unsigned long index; + unsigned long count = 0; + + XA_STATE(xas, &encl->page_array, PFN_DOWN(encl->base)); - xa_for_each(&encl->page_array, index, entry) { + xas_lock(&xas); + xas_for_each(&xas, entry, max_page_index) { if (entry->epc_page) { /* * The page and its radix tree entry cannot be freed @@ -699,9 +703,20 @@ void sgx_encl_release(struct kref *ref) } kfree(entry); - /* Invoke scheduler to prevent soft lockups. */ - cond_resched(); + /* + * Invoke scheduler on every XA_CHECK_SCHED iteration + * to prevent soft lockups.
+ */ + if (!(++count % XA_CHECK_SCHED)) { + xas_pause(&xas); + xas_unlock(&xas); + + cond_resched(); + + xas_lock(&xas); + } } + xas_unlock(&xas); xa_destroy(&encl->page_array); -- Gitee From 9d66387b231da97135c88a7b9e230db57adc2ce8 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 20 Jul 2022 12:13:47 -0700 Subject: [PATCH 131/134] x86/sgx: Allow enclaves to use Asynchrounous Exit Notification commit 370839c241f7b98c66063c2892795a37ee3d2771 upstream. Short Version: Allow enclaves to use the new Asynchronous EXit (AEX) notification mechanism. This mechanism lets enclaves run a handler after an AEX event. These handlers can run mitigations for things like SGX-Step[1]. AEX Notify will be made available both on upcoming processors and on some older processors through microcode updates. Long Version: == SGX Attribute Background == The SGX architecture includes a list of SGX "attributes". These attributes ensure consistency and transparency around specific enclave features. As a simple example, the "DEBUG" attribute allows an enclave to be debugged, but also destroys virtually all of SGX security. Using attributes, enclaves can know that they are being debugged. Attributes also affect enclave attestation so an enclave can, for instance, be denied access to secrets while it is being debugged. The kernel keeps a list of known attributes and will only initialize enclaves that use a known set of attributes. This kernel policy eliminates the chance that a new SGX attribute could cause undesired effects. For example, imagine a new attribute was added called "PROVISIONKEY2" that provided similar functionality to "PROVISIONKEY". A kernel policy that allowed indiscriminate use of unknown attributes and thus PROVISIONKEY2 would undermine the existing kernel policy which limits use of PROVISIONKEY enclaves. == AEX Notify Background == "Intel Architecture Instruction Set Extensions and Future Features - Version 45" is out[2]. There is a new chapter: Asynchronous Enclave Exit Notify and the EDECCSSA User Leaf Function. Enclave exits can be either synchronous and consensual (EEXIT, for instance) or asynchronous (on an interrupt or fault). The asynchronous ones can evidently be exploited to single step enclaves[1], on top of which other naughty things can be built. AEX Notify will be made available both on upcoming processors and on some older processors through microcode updates. == The Problem == These attacks are currently entirely opaque to the enclave since the hardware does the save/restore under the covers. The Asynchronous Enclave Exit Notify (AEX Notify) mechanism provides enclaves the ability to detect and mitigate potential exposure to these kinds of attacks. == The Solution == Define the new attribute value for AEX Notification. Ensure the attribute is cleared from the list of reserved attributes. Instead of adding to the open-coded lists of individual attributes, add named lists of privileged (disallowed by default) and unprivileged (allowed by default) attributes. Add the AEX notify attribute as an unprivileged attribute, which will keep the kernel from rejecting enclaves with it set. 1. https://github.com/jovanbulck/sgx-step 2. https://cdrdv2.intel.com/v1/dl/getContent/671368?explicitVersion=true Intel-SIG: commit 370839c241f7 x86/sgx: Allow enclaves to use Asynchrounous Exit Notification Incremental backporting patches for SGX on Intel Xeon platform.
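The effect of the two named masks can be summarized in a few lines. This is a simplified sketch of the policy, not the literal kernel code; the creation line matches the ioctl.c hunk below, while the provision and EINIT lines approximate the checks that live elsewhere in ioctl.c:

  /* At enclave creation: allow the unprivileged attributes. */
  encl->attributes_mask = SGX_ATTR_UNPRIV_MASK;

  /* SGX_IOC_ENCLAVE_PROVISION grants a privileged attribute. */
  encl->attributes_mask |= SGX_ATTR_PROVISIONKEY;

  /* At EINIT: reject any attribute that was never granted. */
  if (encl->attributes & ~encl->attributes_mask)
          return -EACCES;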
Signed-off-by: Dave Hansen Acked-by: Jarkko Sakkinen Tested-by: Haitao Huang Tested-by: Kai Huang Link: https://lore.kernel.org/all/20220720191347.1343986-1-dave.hansen%40linux.intel.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/include/asm/sgx.h | 33 ++++++++++++++++++++++++++------- arch/x86/kernel/cpu/sgx/ioctl.c | 2 +- arch/x86/kvm/cpuid.c | 4 +--- 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index 460201cbd857..a76c2ef0a037 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -97,17 +97,36 @@ enum sgx_miscselect { * %SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to * sign cryptographic tokens that can be passed to * EINIT as an authorization to run an enclave. + * %SGX_ATTR_ASYNC_EXIT_NOTIFY: Allow enclaves to be notified after an + * asynchronous exit has occurred. */ enum sgx_attribute { - SGX_ATTR_INIT = BIT(0), - SGX_ATTR_DEBUG = BIT(1), - SGX_ATTR_MODE64BIT = BIT(2), - SGX_ATTR_PROVISIONKEY = BIT(4), - SGX_ATTR_EINITTOKENKEY = BIT(5), - SGX_ATTR_KSS = BIT(7), + SGX_ATTR_INIT = BIT(0), + SGX_ATTR_DEBUG = BIT(1), + SGX_ATTR_MODE64BIT = BIT(2), + /* BIT(3) is reserved */ + SGX_ATTR_PROVISIONKEY = BIT(4), + SGX_ATTR_EINITTOKENKEY = BIT(5), + /* BIT(6) is for CET */ + SGX_ATTR_KSS = BIT(7), + /* BIT(8) is reserved */ + /* BIT(9) is reserved */ + SGX_ATTR_ASYNC_EXIT_NOTIFY = BIT(10), }; -#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | BIT_ULL(6) | GENMASK_ULL(63, 8)) +#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | \ + BIT_ULL(6) | \ + BIT_ULL(8) | \ + BIT_ULL(9) | \ + GENMASK_ULL(63, 11)) + +#define SGX_ATTR_UNPRIV_MASK (SGX_ATTR_DEBUG | \ + SGX_ATTR_MODE64BIT | \ + SGX_ATTR_KSS | \ + SGX_ATTR_ASYNC_EXIT_NOTIFY) + +#define SGX_ATTR_PRIV_MASK (SGX_ATTR_PROVISIONKEY | \ + SGX_ATTR_EINITTOKENKEY) /** * struct sgx_secs - SGX Enclave Control Structure (SECS) diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index c69a19e7817d..e7cfe100ad79 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -111,7 +111,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) encl->base = secs->base; encl->size = secs->size; encl->attributes = secs->attributes; - encl->attributes_mask = SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | SGX_ATTR_KSS; + encl->attributes_mask = SGX_ATTR_UNPRIV_MASK; /* Set only after completion, as encl->lock has not been taken. */ set_bit(SGX_ENCL_CREATED, &encl->flags); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 97673552ee4b..5f36c2c9dbd1 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -792,9 +792,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) * userspace. ATTRIBUTES.XFRM is not adjusted as userspace is * expected to derive it from supported XCR0. */ - entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | - SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY | - SGX_ATTR_KSS; + entry->eax &= SGX_ATTR_PRIV_MASK | SGX_ATTR_UNPRIV_MASK; entry->ebx &= 0; break; /* Intel PT */ -- Gitee From 26d082f1e1c2b9267f0e194f9d33076e33aa6f38 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 1 Nov 2022 15:24:22 +1300 Subject: [PATCH 132/134] KVM/VMX: Allow exposing EDECCSSA user leaf function to KVM guest commit 16a7fe3728a8b832ef0d1add66875a666b1f24fc upstream. 
The new Asynchronous Exit (AEX) notification mechanism (AEX-notify) allows an enclave to receive a notification in the ERESUME following an enclave exit due to an AEX. EDECCSSA is a new SGX user leaf function (ENCLU[EDECCSSA]) to facilitate the AEX notification handling. The new EDECCSSA is enumerated via CPUID(EAX=0x12,ECX=0x0):EAX[11]. Besides allowing reporting of the new AEX-notify attribute to KVM guests, also allow reporting the new EDECCSSA user leaf function to KVM guests so the guest can fully utilize the AEX-notify mechanism. Similar to existing X86_FEATURE_SGX1 and X86_FEATURE_SGX2, introduce a new scattered X86_FEATURE_SGX_EDECCSSA bit for the new EDECCSSA, and report it in KVM's supported CPUIDs. Note, no additional KVM enabling is required to allow the guest to use EDECCSSA. It's impossible to trap ENCLU (without completely preventing the guest from using SGX). Advertise EDECCSSA as supported purely so that userspace doesn't need to special case EDECCSSA, i.e. doesn't need to manually check host CPUID. The inability to trap ENCLU also means that KVM can't prevent the guest from using EDECCSSA, but that virtualization hole is benign as far as KVM is concerned. EDECCSSA is simply a fancy way to modify internal enclave state. More background on how AEX-notify and EDECCSSA work: SGX maintains a Current State Save Area Frame (CSSA) for each enclave thread. When AEX happens, the enclave thread context is saved to the CSSA and the CSSA is increased by 1. For a normal ERESUME which doesn't deliver AEX notification, it restores the saved thread context from the previously saved SSA and decreases the CSSA. If AEX-notify is enabled for an enclave, the ERESUME acts differently. Instead of restoring the saved thread context and decreasing the CSSA, it acts like EENTER, which doesn't decrease the CSSA but establishes a clean slate thread context using the CSSA for the enclave to handle the notification. After some handling, the enclave must discard the newly established SSA and switch back to the previously saved SSA (upon AEX). Otherwise, the enclave will run out of SSA space upon further AEXs and eventually fail to run. To solve this problem, the new EDECCSSA essentially decreases the CSSA. It can be used by the enclave notification handler to switch back to the previously saved SSA when needed, i.e. after it handles the notification. Intel-SIG: commit 16a7fe3728a8 KVM/VMX: Allow exposing EDECCSSA user leaf function to KVM guest Incremental backporting patches for SGX on Intel Xeon platform. Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Acked-by: Sean Christopherson Acked-by: Jarkko Sakkinen Link: https://lore.kernel.org/all/20221101022422.858944-1-kai.huang%40intel.com [ Zhiquan: amend commit log and resolve the conflict. commit 013380782d4d ("KVM: x86: Move reverse CPUID helpers to separate header file") moved part of the content from arch/x86/kvm/cpuid.h to arch/x86/kvm/reverse_cpuid.h, so the modifications that upstream applies to arch/x86/kvm/reverse_cpuid.h are applied to arch/x86/kvm/cpuid.h here.
] Signed-off-by: Zhiquan Li --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/cpuid-deps.c | 1 + arch/x86/kernel/cpu/scattered.c | 1 + arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/cpuid.h | 3 +++ 5 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 9683ca0a39fd..c6955d1f7de4 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -290,6 +290,7 @@ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ #define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */ #define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */ +#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* "" SGX EDECCSSA user leaf function */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index b4de14be1509..0753d6b9828f 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -75,6 +75,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, { X86_FEATURE_SGX1, X86_FEATURE_SGX }, { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, + { X86_FEATURE_SGX_EDECCSSA, X86_FEATURE_SGX1 }, { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index b8386e59f8d5..50f1a7e69829 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -38,6 +38,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, + { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 5f36c2c9dbd1..3f551ceb5127 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -448,7 +448,7 @@ void kvm_set_cpu_caps(void) ); kvm_cpu_cap_init_scattered(CPUID_12_EAX, - SF(SGX1) | SF(SGX2) + SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA) ); kvm_cpu_cap_mask(CPUID_8000_0001_ECX, diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 093f913d55a1..95368f8a7edd 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -23,6 +23,7 @@ enum kvm_only_cpuid_leafs { /* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). 
*/ #define KVM_X86_FEATURE_SGX1 KVM_X86_FEATURE(CPUID_12_EAX, 0) #define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1) +#define KVM_X86_FEATURE_SGX_EDECCSSA KVM_X86_FEATURE(CPUID_12_EAX, 11) extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; void kvm_set_cpu_caps(void); @@ -108,6 +109,8 @@ static __always_inline u32 __feature_translate(int x86_feature) return KVM_X86_FEATURE_SGX1; else if (x86_feature == X86_FEATURE_SGX2) return KVM_X86_FEATURE_SGX2; + else if (x86_feature == X86_FEATURE_SGX_EDECCSSA) + return KVM_X86_FEATURE_SGX_EDECCSSA; return x86_feature; } -- Gitee From 9f2c581a0385e9ff69f045c61feb548507a685de Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 1 Jul 2021 13:39:15 -0300 Subject: [PATCH 133/134] tools headers cpufeatures: Sync with the kernel sources commit 51c4f2bf5397b34b79a6712221606e0ab2e6f7ed upstream. To pick the changes from: 5e85c4ebf206e50c ("x86: KVM: Advertise AVX-IFMA CPUID to user space") af2872f622547656 ("x86: KVM: Advertise AMX-FP16 CPUID to user space") 6a19d7aa5821522e ("x86: KVM: Advertise CMPccXADD CPUID to user space") aaa65d17eec372c6 ("x86/tsx: Add a feature bit for TSX control MSR support") b1599915f09157e9 ("x86/cpufeatures: Move X86_FEATURE_CALL_DEPTH from bit 18 to bit 19 of word 11, to leave space for WIP X86_FEATURE_SGX_EDECCSSA bit") 16a7fe3728a8b832 ("KVM/VMX: Allow exposing EDECCSSA user leaf function to KVM guest") 80e4c1cd42fff110 ("x86/retbleed: Add X86_FEATURE_CALL_DEPTH") 7df548840c496b01 ("x86/bugs: Add "unknown" reporting for MMIO Stale Data") This only causes these perf files to be rebuilt: CC /tmp/build/perf/bench/mem-memcpy-x86-64-asm.o CC /tmp/build/perf/bench/mem-memset-x86-64-asm.o And addresses this perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/asm/cpufeatures.h' differs from latest version at 'arch/x86/include/asm/cpufeatures.h' diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h Intel-SIG: commit 51c4f2bf5397 tools headers cpufeatures: Sync with the kernel sources Incremental backporting patches for SGX on Intel Xeon platform. Cc: Adrian Hunter Cc: Borislav Petkov Cc: Chang S. Bae Cc: Dave Hansen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiaxi Chen Cc: Jiri Olsa Cc: Kai Huang Cc: Namhyung Kim Cc: Pawan Gupta Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lore.kernel.org/lkml/Y6CD%2FIcEbDW5X%2FpN@kernel.org/ Signed-off-by: Arnaldo Carvalho de Melo [ Zhiquan Li: amend commit log and resolve the conflict. 
] Signed-off-by: Zhiquan Li --- tools/arch/x86/include/asm/cpufeatures.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 0652d3eed9bd..22ce10ab4802 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -284,9 +284,15 @@ #define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ #define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ +#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* "" SGX EDECCSSA user leaf function */ +#define X86_FEATURE_CALL_DEPTH (11*32+19) /* "" Call depth tracking for RSB stuffing */ +#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ +#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */ +#define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ +#define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ -- Gitee From 0f03bdd33ac82ffe0b5609008667817956e08e4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Borys=20Pop=C5=82awski?= Date: Wed, 5 Oct 2022 00:59:03 +0200 Subject: [PATCH 134/134] x86/sgx: Add overflow check in sgx_validate_offset_length() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f0861f49bd946ff94fce4f82509c45e167f63690 upstream. sgx_validate_offset_length() function verifies "offset" and "length" arguments provided by userspace, but was missing an overflow check on their addition. Add it. Intel-SIG: commit f0861f49bd94 x86/sgx: Add overflow check in sgx_validate_offset_length() Incremental backporting patches for SGX on Intel Xeon platform. Fixes: c6d26d370767 ("x86/sgx: Add SGX_IOC_ENCLAVE_ADD_PAGES") Signed-off-by: Borys Popławski Signed-off-by: Borislav Petkov Reviewed-by: Jarkko Sakkinen Cc: stable@vger.kernel.org # v5.11+ Link: https://lore.kernel.org/r/0d91ac79-6d84-abed-5821-4dbe59fa1a38@invisiblethingslab.com [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- arch/x86/kernel/cpu/sgx/ioctl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index e7cfe100ad79..688bd8620d07 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -356,6 +356,9 @@ static int sgx_validate_offset_length(struct sgx_encl *encl, if (!length || !IS_ALIGNED(length, PAGE_SIZE)) return -EINVAL; + if (offset + length < offset) + return -EINVAL; + if (offset + length - PAGE_SIZE >= encl->size) return -EINVAL; -- Gitee
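Why the new check works: unsigned addition wraps modulo 2^64, so an overflowed offset + length is smaller than either operand, and without the check a wrapped sum could make the subsequent "offset + length - PAGE_SIZE >= encl->size" comparison pass for an out-of-range request. A standalone sketch of the idiom:

  #include <stdbool.h>
  #include <limits.h>

  static bool range_overflows(unsigned long offset, unsigned long length)
  {
          /* A wrapped sum is smaller than both operands. */
          return offset + length < offset;
  }

  /* Example: offset = ULONG_MAX - 4095 and length = 8192 wrap to 4096,
   * which would otherwise look like a small, in-range value.
   */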